In [25]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [26]:
# load all xml files and store in a list
xmlfiles = glob('./data_images/*.xml')

In [27]:
xmlfiles[0]

'./data_images/002244.xml'

In [28]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [29]:
parser_all = list(map(extract_text,xmlfiles))

In [30]:
data = reduce(lambda x, y : x+y,parser_all)

In [31]:
# parser_all
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [32]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,002244.jpg,500,333,car,317,465,215,292
1,002244.jpg,500,333,car,107,145,227,259
2,002244.jpg,500,333,car,144,201,233,266
3,002244.jpg,500,333,car,72,113,231,254
4,002244.jpg,500,333,car,200,274,227,263


In [33]:
df.shape

(15663, 8)

In [34]:
df['name'].value_counts()

name
person         5447
car            1650
chair          1427
bottle          634
pottedplant     625
bird            599
dog             538
sofa            425
bicycle         418
horse           406
boat            398
motorbike       390
cat             389
tvmonitor       367
cow             356
sheep           353
aeroplane       331
train           328
diningtable     310
bus             272
Name: count, dtype: int64

#### Conversion
![Conversion.png](Conversion.png)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15663 entries, 0 to 15662
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  15663 non-null  object
 1   width     15663 non-null  object
 2   height    15663 non-null  object
 3   name      15663 non-null  object
 4   xmin      15663 non-null  object
 5   xmax      15663 non-null  object
 6   ymin      15663 non-null  object
 7   ymax      15663 non-null  object
dtypes: object(8)
memory usage: 979.1+ KB


In [36]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15663 entries, 0 to 15662
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  15663 non-null  object
 1   width     15663 non-null  int64 
 2   height    15663 non-null  int64 
 3   name      15663 non-null  object
 4   xmin      15663 non-null  int64 
 5   xmax      15663 non-null  int64 
 6   ymin      15663 non-null  int64 
 7   ymax      15663 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 979.1+ KB


In [37]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w 
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [38]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,002244.jpg,500,333,car,317,465,215,292,0.782,0.761261,0.296,0.231231
1,002244.jpg,500,333,car,107,145,227,259,0.252,0.72973,0.076,0.096096
2,002244.jpg,500,333,car,144,201,233,266,0.345,0.749249,0.114,0.099099
3,002244.jpg,500,333,car,72,113,231,254,0.185,0.728228,0.082,0.069069
4,002244.jpg,500,333,car,200,274,227,263,0.474,0.735736,0.148,0.108108


### split data into train and test

In [39]:
images = df['filename'].unique()

In [40]:
len(images)

5012

In [41]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [45]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [46]:
len(img_train), len(img_test)

(4010, 1002)

In [47]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [48]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,002244.jpg,500,333,car,317,465,215,292,0.782,0.761261,0.296,0.231231
1,002244.jpg,500,333,car,107,145,227,259,0.252,0.72973,0.076,0.096096
2,002244.jpg,500,333,car,144,201,233,266,0.345,0.749249,0.114,0.099099
3,002244.jpg,500,333,car,72,113,231,254,0.185,0.728228,0.082,0.069069
4,002244.jpg,500,333,car,200,274,227,263,0.474,0.735736,0.148,0.108108


In [49]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
16,006148.jpg,375,500,cow,100,208,231,413,0.410667,0.644,0.288,0.364
17,006148.jpg,375,500,person,92,289,122,452,0.508,0.574,0.525333,0.66
18,009947.jpg,332,500,boat,115,250,92,321,0.549699,0.413,0.406627,0.458
19,009947.jpg,332,500,person,165,186,288,312,0.528614,0.6,0.063253,0.048
22,002114.jpg,500,375,person,324,397,159,375,0.721,0.712,0.146,0.576


### Assign id number to object names

In [50]:
# label encoding
def label_encoding(x):
    labels = {'person':0, 'car':1, 'chair':2, 'bottle':3, 'pottedplant':4, 'bird':5, 'dog':6,
       'sofa':7, 'bicycle':8, 'horse':9, 'boat':10, 'motorbike':11, 'cat':12, 'tvmonitor':13,
       'cow':14, 'sheep':15, 'aeroplane':16, 'train':17, 'diningtable':18, 'bus':19}
    return labels[x]

In [53]:
import warnings
warnings.filterwarnings("ignore")

train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [54]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,002244.jpg,500,333,car,317,465,215,292,0.782,0.761261,0.296,0.231231,1
1,002244.jpg,500,333,car,107,145,227,259,0.252,0.72973,0.076,0.096096,1
2,002244.jpg,500,333,car,144,201,233,266,0.345,0.749249,0.114,0.099099,1
3,002244.jpg,500,333,car,72,113,231,254,0.185,0.728228,0.082,0.069069,1
4,002244.jpg,500,333,car,200,274,227,263,0.474,0.735736,0.148,0.108108,1
5,002244.jpg,500,333,person,214,236,220,269,0.45,0.734234,0.044,0.147147,0
6,002244.jpg,500,333,person,197,217,217,272,0.414,0.734234,0.04,0.165165,0
7,002244.jpg,500,333,person,479,500,214,319,0.979,0.8003,0.042,0.315315,0
8,002244.jpg,500,333,person,468,484,224,269,0.952,0.74024,0.032,0.135135,0
9,002244.jpg,500,333,person,457,473,223,268,0.93,0.737237,0.032,0.135135,0


### Save Image and Labels in text

In [55]:
import os
from shutil import move

In [56]:
train_folder = './data_images/train'
test_folder = './data_images/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

In [57]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [59]:
groupby_obj_train.head()

Unnamed: 0,filename,id,center_x,center_y,w,h
0,002244.jpg,1,0.782000,0.761261,0.296000,0.231231
1,002244.jpg,1,0.252000,0.729730,0.076000,0.096096
2,002244.jpg,1,0.345000,0.749249,0.114000,0.099099
3,002244.jpg,1,0.185000,0.728228,0.082000,0.069069
4,002244.jpg,1,0.474000,0.735736,0.148000,0.108108
...,...,...,...,...,...,...
15656,003452.jpg,2,0.747748,0.828000,0.456456,0.344000
15657,003452.jpg,2,0.226727,0.696000,0.087087,0.264000
15658,003452.jpg,2,0.382883,0.776000,0.345345,0.348000
15659,003452.jpg,2,0.388889,0.573000,0.123123,0.078000


In [61]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('./data_images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)

In [62]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [64]:
# filename_series
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
4005    None
4006    None
4007    None
4008    None
4009    None
Length: 4010, dtype: object

In [65]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0       None
1       None
2       None
3       None
4       None
        ... 
997     None
998     None
999     None
1000    None
1001    None
Length: 1002, dtype: object