In [15]:
import os, re
import xml.etree.ElementTree as ET
import pandas as pd

In [16]:
dataset = {
    'filename': [],
    'size': [],
    'defect': [],
    'xmin': [],
    'xmax': [],
    'ymin': [],
    'ymax': []
}

folder_path = 'data/Annotations_all/'
# iterate over all files in folder
for filename in os.listdir(folder_path):
    tree = ET.parse(os.path.join(folder_path, filename))
    # create a row for each 'object' Element, i.e. for each single defect
    for obj in [node for node in list(tree.iter()) if node.tag == 'object']:
        for node in obj:
            if node.tag == 'name':
                dataset['defect'] += [node.text]
            if node.tag == 'bndbox':
                for child in node:
                    # use a regular expression to match all bounding element tags
                    if re.compile(r'^(x|y)(min|max)').match(child.tag):
                        dataset[child.tag] += [int(child.text)]    
        # writing the data which is identical for each occurence of 'object' in one file
        for node in tree.iter():
            # each filename can appear more than once if the PCB has more than one defect
            # so it is not viable as row ID
            if node.tag == 'filename':
                dataset['filename'] += [node.text]
            if node.tag == 'size':
                # size as 3-tuple
                dataset['size'] += [(int(node[0].text),
                                    int(node[1].text),
                                    int(node[2].text))]        



In [18]:
df = pd.DataFrame(dataset) 

display(df.head(10))

print(df.info())


Unnamed: 0,filename,size,defect,xmin,xmax,ymin,ymax
0,l_light_11_spur_08_5_600,"(600, 600, 3)",spur,541,594,519,552
1,l_light_11_spur_08_5_600,"(600, 600, 3)",spur,319,362,174,204
2,light_05_spurious_copper_10_2_256,"(600, 600, 3)",spurious_copper,129,194,305,327
3,light_05_spurious_copper_10_2_256,"(600, 600, 3)",spurious_copper,132,187,501,556
4,light_05_spur_02_3_256,"(600, 600, 3)",spur,147,182,401,427
5,light_06_mouse_bite_04_1_256,"(600, 600, 3)",mouse_bite,529,561,263,278
6,light_06_mouse_bite_04_1_256,"(600, 600, 3)",mouse_bite,406,420,557,587
7,light_06_mouse_bite_04_1_256,"(600, 600, 3)",mouse_bite,399,424,462,475
8,light_04_spur_04_4_256,"(600, 600, 3)",spur,137,165,136,176
9,rotation_270_light_07_spurious_copper_02_1_600,"(601, 601, 3)",spurious_copper,87,119,533,599


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21664 entries, 0 to 21663
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  21664 non-null  object
 1   size      21664 non-null  object
 2   defect    21664 non-null  object
 3   xmin      21664 non-null  int64 
 4   xmax      21664 non-null  int64 
 5   ymin      21664 non-null  int64 
 6   ymax      21664 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 1.2+ MB
None


In [19]:
df.to_csv('PCB_annotations_dataset_tupled_size.csv', sep=';', index=False)