In [41]:
"""
This script initializes a dataset according to the user inputs. The user can choose one or more of the defects 
to be included in the dataset including the number of samples per defect. 

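In order for the script to work correctly, the parent folder of the folder containing this file (the project root) should contain a 'data' folder with the following subfolders (missing folders are created by the script):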

data
|__Annotations
|__Images
|__Images_bb
|__Pixel_masks

In order to not push the 'data' subfolder to the remote repo, 
the working directory should have the file '.gitignore' with the line '/data/'.

Furthermore, the complete downloaded data is expected in the folder 'Data_science' inside the user's home directory:

extract_dir = os.path.expanduser("~/Data_science")

VOC_PCB
|__Annotations
|__JPEGImages

So the folder structure should look like this, where 'PCB_DEFECTS_DETECTION/notebooks' is the folder containing this python file:

PCB_DEFECTS_DETECTION
|__data
|  |__Annotations
|  |__Images
|  |__Images_bb
|  |__Pixel_masks
|
|__notebooks
|  |__Initialize_dataset.ipynb


The script then does the following:
1) it deletes the contents of the subfolders 'Annotations', 'Images', 'Images_bb', 'Pixel_masks' and 'csv'
2) it copies a random choice of samples of the chosen defect types and respective size into the folder '/data/Images/'
3) it copies the according annotation xml files into the folder '/data/Annotations/'
4) it generates a corresponding csv file 'PCB_annotations_dataset.csv' in the folder '/data/csv/' with one row for each defect instance, 
   i.e. possibly multiple rows per image
5) for each image in the dataset it generates an image with the drawn bounding boxes around the defects in the folder '/data/Images_bb/'
6) for each image in the dataset it generates a pixel mask (the label) which is white on the defect locations and black otherwise 
   in the folder '/data/Pixel_masks/'
"""

'\nThis script initializes a dataset according to the user inputs. The user can choose one or more of the defects \nto be included in the dataset including the number of samples per defect. \n\nIn order for the script to work correctly, the folder containing this file should have the following subfolders:\n\ndata\n|__Annotations\n|__Images\n|__Images_bb\n|__Pixel_masks\n\nIn order to not push the \'data\' subfolder to the remote repo, \nthe working directory should have the file \'.gitignore\' with the line \'/data/\'.\n\nFurthermore, the parent directory of the working directory should contain the folders where the complete Data is downloaded:\n\nextract_dir = os.path.expanduser("~/Data_science")\n\nVOC_PCB\n|__Annotations\n|__JPEGImages\n\nSo the folder structure should look like this, where \'PCB_DEFECTS_DETECTION/notebooks\' is the folder containing this python file:\n\nPCB_DEFECTS_DETECTION\n|__data\n|  |__Annotations\n|  |__Images\n|  |__Images_bb\n|  |__Pixel_masks\n|\n|__notebo

In [42]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import cv2, os, re
import shutil

In [43]:
# function definitions
def draw_bounding_boxes(df, filename, img_path):
    """
    Draws the bounding boxes of all defects annotated for one PCB image into that image.

    The image is only modified in memory; writing it to disk is left to the caller.

    returns:
        a tuple (image with the bounding boxes (numpy.ndarray), image file name (string)),
        or None if the image file does not exist or cannot be read
        (a message is printed in that case)

    Args:
        df (pandas.DataFrame): a PCB-annotation DataFrame with the columns
            'filename', 'xmin', 'ymin', 'xmax' and 'ymax'
        filename (string): the filename of the PCB image with file type ending
        img_path (string): the relative path to the folder containing the image(s)
    """
    image_file = os.path.join(img_path, filename)

    # isfile() also covers a missing folder, where os.listdir() would raise
    if not os.path.isfile(image_file):
        print(f"Image {filename} not found in {img_path}")
        return None

    img = cv2.imread(image_file)
    # cv2.imread does not raise on unreadable files, it returns None
    if img is None:
        print(f"Image {filename} in {img_path} could not be read")
        return None

    # Select the annotation rows of exactly this image. The annotation may store the
    # name with or without a file type ending, so both spellings are accepted.
    # A plain prefix test would also pick up other images whose name merely starts
    # with the same characters.
    stem = os.path.splitext(filename)[0]
    annotated_names = df['filename'].astype(str)
    annotated_stems = annotated_names.map(lambda name: os.path.splitext(name)[0])
    is_this_image = (
        (annotated_names == filename)
        | (annotated_names == stem)
        | (annotated_stems == stem)
    )
    boxes = df.loc[is_this_image, ['xmin', 'ymin', 'xmax', 'ymax']]

    # For each defect, draw a red frame (BGR colour order) along the border of the bounding box
    for box in boxes.itertuples(index=False):
        top_left = (int(box.xmin), int(box.ymin))
        bottom_right = (int(box.xmax), int(box.ymax))
        cv2.rectangle(img, top_left, bottom_right, (0, 0, 255), 2)

    return img, filename

In [44]:
def generate_pixel_matrix(df, filename):
    """
    Generates a black image of the same size as the passed PCB image, with white pixels
    exactly inside the defect bounding box(es) of the passed PCB image.

    The mask is indexed like an image array, i.e. mask[row (y), column (x)], so its
    shape is (height, width). Bounding boxes are treated as inclusive on both ends
    and are clipped to the image borders.

    returns:
        the pixel matrix (numpy.ndarray of floats, 0 = black, 255 = white)
        the filename exactly as passed in (string)

    raises:
        KeyError: if df has no row for the passed filename

    Args:
        df (pandas.DataFrame): a PCB-annotation DataFrame with the columns
            'filename', 'width', 'height', 'xmin', 'ymin', 'xmax' and 'ymax'
        filename (string): the filename of the PCB image as stored in df['filename']
    """
    # all annotation rows (one per defect) belonging to this image
    pcb = df[df['filename'] == filename]
    if pcb.empty:
        raise KeyError(filename)

    width = int(pcb['width'].iloc[0])
    height = int(pcb['height'].iloc[0])

    # rows correspond to y, columns to x -> (height, width), all black
    mask = np.zeros((height, width))

    # for each defect set the pixels inside the bounding box to white
    for box in pcb[['xmin', 'ymin', 'xmax', 'ymax']].itertuples(index=False):
        # clamp to the image so that negative or too large coordinates neither
        # wrap around nor index past the border
        x_start = max(int(box.xmin), 0)
        y_start = max(int(box.ymin), 0)
        x_stop = max(min(int(box.xmax) + 1, width), 0)
        y_stop = max(min(int(box.ymax) + 1, height), 0)
        mask[y_start:y_stop, x_start:x_stop] = 255

    return (mask, filename)

In [45]:
def get_user_choice():
    """
    Interactively asks which defect types to include and how many images to use per defect.

    Both questions are repeated until the answer is valid:
      * defect types: one to six distinct numbers from 1 to 6, separated by single blanks
      * sample size: a whole number with one to three digits (0 to 999)

    returns:
        the chosen defect names in the order they were entered (list of strings)
        the chosen number of images per defect (int)
    """
    defects = {1: 'missing_hole', 2: 'mouse_bite', 3: 'open_circuit', 4: 'short', 5: 'spur', 6: 'spurious_copper'}

    selection_pattern = re.compile(r"[1-6](?: [1-6]){0,5}")
    while True:
        user_input = input(f'Please select one or more defect types from:\n{defects}\n(separated by blank spaces, no duplicates):').strip()
        selected = user_input.split(' ')
        # the whole answer has to match and no number may appear twice
        if selection_pattern.fullmatch(user_input) and len(set(selected)) == len(selected):
            break
    chosen_defects = [defects[int(number)] for number in selected]

    # fullmatch: a prefix match would accept e.g. '1234' (more than 999) and
    # '12abc' (which then makes int() fail)
    size_pattern = re.compile(r"\d{1,3}")
    while True:
        user_input = input('How many images per defect (integer, max. 999)? ').strip()
        if size_pattern.fullmatch(user_input):
            break
    chosen_size = int(user_input)

    return (chosen_defects, chosen_size)


In [46]:
def clear_subfolders(image_dest_path, annot_dest_path, bb_path, mask_path, csv):
    """
    Removes every entry from the five passed folders; the folders themselves are kept.

    Entries are deleted with os.remove, so the folders are expected to contain
    only files (os.remove raises for directories).

    Args:
        image_dest_path (string): folder with the dataset images
        annot_dest_path (string): folder with the annotation files
        bb_path (string): folder with the bounding-box images
        mask_path (string): folder with the pixel masks
        csv (string): folder with the generated csv file(s)
    """
    folders_to_clear = (image_dest_path, annot_dest_path, bb_path, mask_path, csv)
    for folder in folders_to_clear:
        for entry in os.listdir(folder):
            os.remove(os.path.join(folder, entry))


In [47]:
def copy_samples(chosen_defects, chosen_size, image_pool_path, image_dest_path, annot_pool_path, annot_dest_path):
    """
    Copies a random sample of images per chosen defect type, together with the
    matching annotation xml files, from the pool folders into the dataset folders.

    An image belongs to a defect type if its filename contains the defect name as a
    whole word, i.e. not directly surrounded by further letters. That way 'spur'
    does not also select the 'spurious_copper' images. If the pool holds fewer
    images of a type than requested, all of them are copied.

    Args:
        chosen_defects (list of strings): the defect names to include
        chosen_size (int): the number of images to pick per defect
        image_pool_path (string): folder containing all available images
        image_dest_path (string): folder the picked images are copied to
        annot_pool_path (string): folder containing all annotation xml files
        annot_dest_path (string): folder the matching xml files are copied to
    """
    print(f"Picking from {chosen_defects}")

    # list the pool once; sorting makes the sampling reproducible for a fixed random seed
    pool_files = sorted(os.listdir(image_pool_path))

    # dict.fromkeys drops repeated defect names while keeping their order
    for defect_name in dict.fromkeys(chosen_defects):
        whole_name = re.compile(rf"(?<![A-Za-z]){re.escape(defect_name)}(?![A-Za-z])")
        candidates = [name for name in pool_files if whole_name.search(name)]

        n_picks = min(len(candidates), chosen_size)
        if n_picks <= 0:
            continue
        rnd_picks = np.random.choice(candidates, n_picks, replace=False)

        for picked in rnd_picks:
            filename = str(picked)
            shutil.copy(os.path.join(image_pool_path, filename),
                        os.path.join(image_dest_path, filename))

            # the annotation has the same name as the image, only with the ending '.xml';
            # splitext works for file type endings of any length
            xml_filename = os.path.splitext(filename)[0] + ".xml"
            shutil.copy(os.path.join(annot_pool_path, xml_filename),
                        os.path.join(annot_dest_path, xml_filename))


In [48]:
def _required_text(parent, path, source):
    """Returns the text of the first element matching path below parent, or raises ValueError."""
    node = parent.find(path)
    if node is None or node.text is None:
        raise ValueError(f"Element '{path}' is missing or empty in {source}")
    return node.text


def generate_PCB_csv(annotation_path, csv_path):
    """
    Collects all annotation xml files of a folder in one table and saves it as
    'PCB_annotations_dataset.csv' (separator ';') in the folder csv_path.

    The table has one row per 'object' element, i.e. per single defect, so an image
    with several defects appears in several rows and its filename is not a row ID.
    The image data ('filename', 'width', 'height', 'depth') is repeated in each row.

    returns:
        the annotation table (pandas.DataFrame) with the columns
        filename, width, height, depth, defect, xmin, xmax, ymin, ymax

    raises:
        ValueError: if an xml file lacks one of the expected elements or a
            numeric element does not contain an integer

    Args:
        annotation_path (string): folder containing the annotation xml files
        csv_path (string): existing folder the csv file is written to
    """
    columns = ['filename', 'width', 'height', 'depth', 'defect', 'xmin', 'xmax', 'ymin', 'ymax']
    records = []

    # sorted for a reproducible row order; entries that are not xml files are skipped
    for annot_name in sorted(os.listdir(annotation_path)):
        annot_file = os.path.join(annotation_path, annot_name)
        if not (annot_name.lower().endswith('.xml') and os.path.isfile(annot_file)):
            continue

        root = ET.parse(annot_file).getroot()

        # the data which is identical for each occurrence of 'object' in one file
        image_info = {'filename': _required_text(root, './/filename', annot_name)}
        for tag in ('width', 'height', 'depth'):
            image_info[tag] = int(_required_text(root, f'.//{tag}', annot_name))

        # one row for each 'object' element, i.e. for each single defect
        for obj in root.iter('object'):
            record = dict(image_info)
            record['defect'] = _required_text(obj, 'name', annot_name)
            for coord in ('xmin', 'xmax', 'ymin', 'ymax'):
                record[coord] = int(_required_text(obj, f'bndbox/{coord}', annot_name))
            records.append(record)

    df = pd.DataFrame(records, columns=columns)

    # os.path.join instead of a hard-coded '\\' so the file lands inside csv_path on every OS
    df.to_csv(os.path.join(csv_path, 'PCB_annotations_dataset.csv'), sep=';', index=False)
    return(df)

In [49]:
# The notebook is expected to run from a subfolder of the project (e.g. 'notebooks'),
# so the project root is the parent of the current working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
working_path = os.path.join(parent_dir) #incase directory needs to be expanded

# Folders inside the user's home directory. Only "~" is expanded and the rest is
# appended with os.path.join: a hard-coded "\\" keeps expanduser from resolving
# the home directory on non-Windows systems.
downloads_path = os.path.join(os.path.expanduser("~"), "Downloads")
extract_dir = os.path.join(os.path.expanduser("~"), "Data_science")

In [50]:
# Set up folder paths
# Source pool: the complete downloaded dataset (images and their annotation files)
image_pool_path, annot_pool_path = (
    os.path.join(extract_dir, 'VOC_PCB', subfolder)
    for subfolder in ('JPEGImages', 'Annotations')
)

# Destinations inside the project's 'data' folder, in this order: sampled images,
# their annotations, images with drawn bounding boxes, pixel masks, generated csv file
image_dest_path, annot_dest_path, bb_path, mask_path, csv_path = (
    os.path.join(working_path, 'data', subfolder)
    for subfolder in ('Images', 'Annotations', 'Images_bb', 'Pixel_masks', 'csv')
)
    

In [51]:
# Make sure all destination folders exist before they are cleared and filled.
# The message names the folder in both cases, so the output shows which path was checked.
for _dest_dir in (image_dest_path, annot_dest_path, bb_path, mask_path, csv_path):
    if os.path.exists(_dest_dir):
        print("Directory already exists:", _dest_dir)
    else:
        # exist_ok guards against the folder appearing between the check and the creation
        os.makedirs(_dest_dir, exist_ok=True)
        print("Directory created successfully:", _dest_dir)

Directory already exists:
Directory already exists:
Directory already exists:
Directory already exists:
Directory already exists:


In [52]:
# Aliases for the two destination folders that the later cells read from
# (annotations -> csv generation, images -> bounding-box drawing).
img_path = image_dest_path # only for more intuitive variable names later on
annotation_path = annot_dest_path # only for more intuitive variable names later on

In [53]:
# Remove the files of a previous run from all five destination folders
# (images, annotations, bounding-box images, pixel masks, csv) so the new dataset starts empty.
clear_subfolders(image_dest_path, annot_dest_path, bb_path, mask_path, csv_path)

In [54]:
# Ask interactively which defect types to include and how many images to pick per defect type.
chosen_defects, chosen_size = get_user_choice()

In [55]:
# Show the selected defect names as a quick check of the user input.
print(chosen_defects)

['missing_hole', 'mouse_bite', 'open_circuit', 'short', 'spur', 'spurious_copper']


In [56]:
# Copy a random selection of images per chosen defect type, together with the
# matching annotation xml files, from the pool folders into the data folders.
copy_samples(chosen_defects, chosen_size, image_pool_path, image_dest_path, annot_pool_path, annot_dest_path)

Picking from ['missing_hole', 'mouse_bite', 'open_circuit', 'short', 'spur', 'spurious_copper']


In [57]:
# Build the annotation table (one row per defect) from the copied xml files
# and save it as csv file in the csv folder.
df = generate_PCB_csv(annotation_path, csv_path)

In [58]:
# Load the annotation table from the generated csv file.
# Paths are built with os.path.join instead of a hard-coded '\\' so the files
# end up inside the intended folders on every operating system.
df = pd.read_csv(os.path.join(csv_path, 'PCB_annotations_dataset.csv'), sep=';')

# call the function draw_bounding_boxes once for each image
for filename in os.listdir(img_path):
    result = draw_bounding_boxes(df, filename, img_path)
    if result is None:
        # draw_bounding_boxes returns nothing (and prints a message) if the image
        # was not found; skip it instead of failing on the tuple unpacking
        continue
    img, filename = result
    cv2.imwrite(os.path.join(bb_path, f"bb-{filename}"), img)

# call the function generate_pixel_matrix once for each PCB in the annotation file
for filename in df.filename.unique():
    mask, filename = generate_pixel_matrix(df, filename)
    cv2.imwrite(os.path.join(mask_path, f"pm-{filename}.png"), mask)