In [None]:
!pip install xmltodict

In [None]:
import os
import shutil
import xmltodict
from tqdm import tqdm

In [None]:
# Root of the source dataset. The cells below list the XML annotation files in
# its train/val/test subfolders and take the JPEG images from its images/ subfolder.
voc_base_dir = 'FinTabNet.c/FinTabNet.c-Structure'
# Root of the converted dataset. Label .txt files are written under
# labels/<split> and the images are moved under images/<split>.
coco_base_dir = 'FinTabNet.c/FinTabNet.c-Structure-COCO'

In [None]:
# Create the output tree (images/ and labels/, each with train/val/test) using
# the standard library instead of shell escapes: no dependence on a POSIX
# `mkdir`, and no unquoted variable interpolation into a shell command.
# exist_ok=True keeps the cell safe to re-run; makedirs also creates
# coco_base_dir itself and the intermediate folders.
for output_kind in ('images', 'labels'):
    for split_name in ('train', 'val', 'test'):
        os.makedirs(os.path.join(coco_base_dir, output_kind, split_name), exist_ok=True)

In [None]:
import multiprocessing
from joblib import Parallel, delayed

# Number of CPUs reported by the machine; used below as the joblib job count.
max_processes = multiprocessing.cpu_count()
print(f'max_processes: {max_processes}')

In [None]:
# Class name -> integer id written as the first field of every label line.
# The id of a class is its position in this list.
class_names_map = {
    class_name: class_id
    for class_id, class_name in enumerate([
        'table',
        'table column',
        'table row',
        'table column header',
        'table projected row header',
        'table spanning cell',
        'table grid cell',
    ])
}

def voc2coco_single(xml_label_filename):
    """Convert one Pascal VOC XML annotation file into a text label file.

    Reads ``voc_base_dir/<xml_label_filename>`` and, when the annotation
    contains at least one ``object`` entry, writes
    ``coco_base_dir/labels/<xml_label_filename without extension>.txt`` with
    one line per usable bounding box::

        <class_id> <center_x> <center_y> <width> <height>

    The four geometry values are divided by the image width/height taken from
    the XML ``size`` element (the normalised layout commonly used by
    YOLO-style trainers, despite the "coco" in the names here).

    Box corners are truncated to integers before use, and boxes whose
    resulting width or height is not positive are dropped.

    Objects whose name is missing from ``class_names_map`` are reported and
    skipped rather than given a fresh id. This function is dispatched through
    ``joblib.Parallel``; an id invented inside one worker process would not be
    shared with the other workers or sent back to the notebook, so the same
    number could end up meaning different classes in different label files.

    Args:
        xml_label_filename: Path of the XML file relative to ``voc_base_dir``,
            e.g. ``'train/some_table.xml'``. Surrounding whitespace is stripped.

    Returns:
        None. Problems are reported with ``print`` and the offending file or
        object is skipped.
    """
    xml_label_filename = xml_label_filename.strip()
    label_stem, extension = os.path.splitext(xml_label_filename)
    if extension != '.xml':
        print('[BUG] xml_label_filename:', xml_label_filename)
        return

    # The context manager closes the file even if reading or parsing fails.
    with open(os.path.join(voc_base_dir, xml_label_filename), encoding='utf-8') as xml_file:
        annotation_root = xmltodict.parse(xml_file.read())['annotation']

    image_width = int(annotation_root['size']['width'])
    image_height = int(annotation_root['size']['height'])
    if image_width <= 0 or image_height <= 0:
        # Normalising by a zero size would raise ZeroDivisionError after the
        # label file had already been truncated; report and bail out first.
        print('[BUG] invalid image size:', xml_label_filename, image_width, image_height)
        return

    if 'object' not in annotation_root:
        # Nothing annotated: no label file is produced for this image.
        return

    objects = annotation_root['object']
    if not isinstance(objects, list):
        # xmltodict yields a single mapping (not a list) for a lone <object>.
        objects = [objects]

    label_path = os.path.join(coco_base_dir, 'labels', label_stem + '.txt')
    with open(label_path, 'w') as label_file:
        for obj in objects:
            class_name = obj['name']
            class_id = class_names_map.get(class_name)
            if class_id is None:
                print('[BUG] unknown class name:', repr(class_name), 'in', xml_label_filename)
                continue

            box = obj['bndbox']
            xmin = int(float(box['xmin']))
            ymin = int(float(box['ymin']))
            xmax = int(float(box['xmax']))
            ymax = int(float(box['ymax']))

            box_width = xmax - xmin
            box_height = ymax - ymin
            if box_width <= 0 or box_height <= 0:
                continue

            center_x = (xmin + xmax) / 2
            center_y = (ymin + ymax) / 2
            fields = (
                class_id,
                center_x / image_width,
                center_y / image_height,
                box_width / image_width,
                box_height / image_height,
            )
            label_file.write(' '.join(str(field) for field in fields) + '\n')

def voc2coco(split='train'):
    """Convert every annotation file of one dataset split in parallel.

    Lists ``voc_base_dir/<split>``, prefixes each entry with ``<split>/`` and
    hands the resulting relative paths to ``voc2coco_single`` through
    ``joblib.Parallel`` using ``max_processes`` jobs. A tqdm bar tracks how
    many entries have been handed to joblib.

    Args:
        split: Name of the subfolder of ``voc_base_dir`` to convert.
    """
    split_dir = os.path.join(voc_base_dir, split)
    relative_paths = [split + '/' + entry for entry in os.listdir(split_dir)]
    print(split, len(relative_paths))

    jobs = (delayed(voc2coco_single)(relative_path) for relative_path in tqdm(relative_paths))
    Parallel(n_jobs=max_processes)(jobs)

In [None]:
# Convert the annotations of every dataset split, one split after another.
for split_name in ('train', 'val', 'test'):
    voc2coco(split_name)

# Show the class-name -> id mapping held by this notebook process.
print('class_names_map:', class_names_map)

In [None]:
def copy_images(split='train'):
    """Move the JPEG belonging to each XML annotation of ``split`` to the output tree.

    For every ``<name>.xml`` entry found in ``voc_base_dir/<split>`` the image
    ``voc_base_dir/images/<name>.jpg`` is moved to
    ``coco_base_dir/images/<split>/<name>.jpg``.

    Note that, despite the function name, images are *moved*, not copied:
    after a successful run they are gone from ``voc_base_dir/images``, so
    running the cell again finds nothing to move and reports every image as
    skipped.

    Images that cannot be moved (for example because the source file does not
    exist) are skipped; the number of skipped images is printed at the end
    instead of being silently ignored.

    Args:
        split: Name of the subfolder of ``voc_base_dir`` whose annotations
            decide which images are moved.
    """
    entries = os.listdir(os.path.join(voc_base_dir, split))
    print(split, len(entries))

    skipped = 0
    for entry in tqdm(entries):
        entry = entry.strip()
        # Swap only the extension. A plain str.replace('xml', 'jpg') would also
        # rewrite any 'xml' inside the file stem and point at a missing image.
        stem, extension = os.path.splitext(entry)
        if extension != '.xml':
            continue

        jpg_name = stem + '.jpg'
        source_path = os.path.join(voc_base_dir, 'images', jpg_name)
        target_path = os.path.join(coco_base_dir, 'images', split, jpg_name)
        try:
            shutil.move(source_path, target_path)
        except OSError:
            # Best effort: keep going, but only for file-system errors, so
            # KeyboardInterrupt and programming errors still surface.
            skipped += 1

    if skipped:
        print('[WARN]', split, 'images that could not be moved:', skipped)

In [None]:
# Relocate the images of every dataset split, one split after another.
for split_name in ('train', 'val', 'test'):
    copy_images(split_name)