In [59]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [60]:
xmlTrainFilesList = glob("Pothole.v1-raw.voc/train/*.xml")
xmlTestFilesList = glob("Pothole.v1-raw.voc/test/*.xml")

In [61]:
def parse_xml(filename):
    # Parse the XML file
    tree = et.parse(filename)
    root = tree.getroot()
    
    # Extract file name
    file_name = root.find('filename').text
    
    # Extract image size
    width = int(root.find('size/width').text)
    height = int(root.find('size/height').text)
    
    # Extract object information
    objects = []
    for obj in root.findall('object'):
        obj_name = obj.find('name').text
        xmin = int(obj.find('bndbox/xmin').text)
        xmax = int(obj.find('bndbox/xmax').text)
        ymin = int(obj.find('bndbox/ymin').text)
        ymax = int(obj.find('bndbox/ymax').text)
        objects.append({
            'name': obj_name,
            'xmin': xmin,
            'xmax': xmax,
            'ymin': ymin,
            'ymax': ymax
        })
    
    return {
        'file_name': file_name,
        'size': {'width': width, 'height': height},
        'objects': objects
    }

# Example usage
filename = 'Pothole.v1-raw.voc/train/img-476_jpg.rf.b7bf5047096c34127a33e35decef5680.xml'
data = parse_xml(filename)
print("File Name:", data['file_name'])
print("Size:", data['size'])
print("Objects:")
for obj in data['objects']:
    print(obj)


File Name: img-476_jpg.rf.b7bf5047096c34127a33e35decef5680.jpg
Size: {'width': 450, 'height': 300}
Objects:
{'name': 'pothole', 'xmin': 64, 'xmax': 131, 'ymin': 155, 'ymax': 185}
{'name': 'pothole', 'xmin': 287, 'xmax': 321, 'ymin': 135, 'ymax': 150}
{'name': 'pothole', 'xmin': 297, 'xmax': 338, 'ymin': 176, 'ymax': 205}
{'name': 'pothole', 'xmin': 259, 'xmax': 325, 'ymin': 232, 'ymax': 274}


In [62]:
patholesTrainDataList = []
patholesTestDataList = []
for filename in xmlTrainFilesList:
    patholesTrainDataList.append(parse_xml(filename))

for filename in xmlTestFilesList:
    patholesTestDataList.append(parse_xml(filename))


In [63]:
import pandas as pd

train_data_list = []
test_data_list = []
for data in patholesTrainDataList:
    filename = data['file_name']
    size = data['size']
    width = size['width']
    height = size['height']
    objs = data['objects']
    for obj in objs:
        name = obj['name']
        xmin = obj['xmin']
        xmax = obj['xmax']
        ymin = obj['ymin']
        ymax = obj['ymax']
        train_data_list.append([filename, width, height, name, xmin, xmax, ymin, ymax])

for data in patholesTestDataList:
    filename = data['file_name']
    size = data['size']
    width = size['width']
    height = size['height']
    objs = data['objects']
    for obj in objs:
        name = obj['name']
        xmin = obj['xmin']
        xmax = obj['xmax']
        ymin = obj['ymin']
        ymax = obj['ymax']
        test_data_list.append([filename, width, height, name, xmin, xmax, ymin, ymax])


# Define column names
columns = ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax']

# Create DataFrame
train_data_list_frame = pd.DataFrame(train_data_list, columns=columns)
test_data_list_frame = pd.DataFrame(test_data_list, columns=columns)
train_data_list_frame.head(10)


Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,img-476_jpg.rf.b7bf5047096c34127a33e35decef568...,450,300,pothole,64,131,155,185
1,img-476_jpg.rf.b7bf5047096c34127a33e35decef568...,450,300,pothole,287,321,135,150
2,img-476_jpg.rf.b7bf5047096c34127a33e35decef568...,450,300,pothole,297,338,176,205
3,img-476_jpg.rf.b7bf5047096c34127a33e35decef568...,450,300,pothole,259,325,232,274
4,img-530_jpg.rf.cbb192f42b0fcae349cd1e320c1ab74...,509,300,pothole,1,59,15,54
5,img-530_jpg.rf.cbb192f42b0fcae349cd1e320c1ab74...,509,300,pothole,56,138,1,16
6,img-530_jpg.rf.cbb192f42b0fcae349cd1e320c1ab74...,509,300,pothole,139,297,1,41
7,img-530_jpg.rf.cbb192f42b0fcae349cd1e320c1ab74...,509,300,pothole,334,497,1,42
8,img-530_jpg.rf.cbb192f42b0fcae349cd1e320c1ab74...,509,300,pothole,236,355,50,102
9,img-530_jpg.rf.cbb192f42b0fcae349cd1e320c1ab74...,509,300,pothole,91,217,93,143


In [64]:
train_data_list_frame.shape
test_data_list_frame.shape

(154, 8)

In [65]:
train_data_list_frame.info()
test_data_list_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1256 entries, 0 to 1255
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1256 non-null   object
 1   width     1256 non-null   int64 
 2   height    1256 non-null   int64 
 3   name      1256 non-null   object
 4   xmin      1256 non-null   int64 
 5   xmax      1256 non-null   int64 
 6   ymin      1256 non-null   int64 
 7   ymax      1256 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 78.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  154 non-null    object
 1   width     154 non-null    int64 
 2   height    154 non-null    int64 
 3   name      154 non-null    object
 4   xmin      154 non-null    int64 
 5   xmax      154 non-null    int64 
 6   ymin      154 non-null    int64 
 7   ymax      15

In [66]:
train_data_list_frame["centerX"] = ((train_data_list_frame['xmax'] + train_data_list_frame['xmin'])/2)/train_data_list_frame['width']
train_data_list_frame["centerY"] = ((train_data_list_frame['ymax'] + train_data_list_frame['ymin'])/2)/train_data_list_frame['height']

train_data_list_frame["w"] = (train_data_list_frame['xmax'] - train_data_list_frame['xmin'])/train_data_list_frame['width']
train_data_list_frame["h"] = (train_data_list_frame['ymax'] - train_data_list_frame['ymin'])/train_data_list_frame['height']

train_data_list_frame.head()


test_data_list_frame["centerX"] = ((test_data_list_frame['xmax'] + test_data_list_frame['xmin'])/2)/test_data_list_frame['width']
test_data_list_frame["centerY"] = ((test_data_list_frame['ymax'] + test_data_list_frame['ymin'])/2)/test_data_list_frame['height']

test_data_list_frame["w"] = (test_data_list_frame['xmax'] - test_data_list_frame['xmin'])/test_data_list_frame['width']
test_data_list_frame["h"] = (test_data_list_frame['ymax'] - test_data_list_frame['ymin'])/test_data_list_frame['height']

test_data_list_frame.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,centerX,centerY,w,h
0,img-269_jpg.rf.f51d9eb8d02a34ac01d4a486cbfbdd4...,720,720,pothole,278,386,323,435,0.461111,0.526389,0.15,0.155556
1,img-269_jpg.rf.f51d9eb8d02a34ac01d4a486cbfbdd4...,720,720,pothole,602,661,497,553,0.877083,0.729167,0.081944,0.077778
2,img-398_jpg.rf.0c484369fdb23fdec1b9250477fc5d1...,684,300,pothole,31,607,40,300,0.466374,0.566667,0.842105,0.866667
3,img-394_jpg.rf.2182e193f33ed5bcce45df7df27032f...,450,300,pothole,1,139,150,235,0.155556,0.641667,0.306667,0.283333
4,img-394_jpg.rf.2182e193f33ed5bcce45df7df27032f...,450,300,pothole,131,433,2,138,0.626667,0.233333,0.671111,0.453333


In [67]:
cols = ['filename','id','center_x','center_y','w','h']
groupby_obj__train = train_data_list_frame.groupby('filename')

groupby_obj__train


cols = ['filename','id','center_x','center_y','w','h']
groupby_obj__train = train_data_list_frame.groupby('filename')


#----------------------------------------------------------------

cols = ['filename','id','center_x','center_y','w','h']
groupby_obj__test = test_data_list_frame.groupby('filename')

groupby_obj__test


cols = ['filename','id','center_x','center_y','w','h']
groupby_obj__test = test_data_list_frame.groupby('filename')



In [68]:
def save_data(filename, folder_path, group_obj):
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False, header=False)

In [69]:
filename_series_train=pd.Series(groupby_obj__train.groups.keys())
filename_series_test=pd.Series(groupby_obj__test.groups.keys())

In [70]:
filename_series_train.apply(save_data,args=('Pothole.v1-raw.voc/train',groupby_obj__train))
filename_series_test.apply(save_data,args=('Pothole.v1-raw.voc/test',groupby_obj__test))

0     None
1     None
2     None
3     None
4     None
      ... 
62    None
63    None
64    None
65    None
66    None
Length: 67, dtype: object