In [111]:
import json, os, time
import pandas as pd
import numpy as np

# Root of the PaddleDetection dataset tree. Defaults to the original location;
# set the PADDLE_DATASET_DIR environment variable to run on another machine.
dataset_dir = os.environ.get(
    'PADDLE_DATASET_DIR', '/home/lstm/github/PaddleDetection/dataset'
).rstrip('/')

# Folder of the CDLA reference dataset whose category ids the new labels must follow.
org_label_root = f'{dataset_dir}/_CDLA_DATASET/coco'
# Output folder for the converted annotations, stamped with today's date (YYYYMMDD).
new_dataset_root = f'{dataset_dir}/_KRCASE_{time.strftime("%Y%m%d")}/train'

In [68]:
# Load the CDLA reference annotations and keep their category table.
org_label_path = f'{org_label_root}/val_.json'
# JSON is UTF-8 by specification; pass the encoding explicitly so the file is
# decoded the same way regardless of the platform's locale.
with open(org_label_path, 'r', encoding='utf-8') as f:
    org_annot = json.load(f)

# List of {'supercategory', 'id', 'name'} dicts; these ids are the target id space.
org_label = org_annot['categories']
org_label

[{'supercategory': None, 'id': 1, 'name': 'Text'},
 {'supercategory': None, 'id': 2, 'name': 'Title'},
 {'supercategory': None, 'id': 3, 'name': 'Figure'},
 {'supercategory': None, 'id': 4, 'name': 'Figure caption'},
 {'supercategory': None, 'id': 5, 'name': 'Table'},
 {'supercategory': None, 'id': 6, 'name': 'Table caption'},
 {'supercategory': None, 'id': 7, 'name': 'Header'},
 {'supercategory': None, 'id': 8, 'name': 'Footer'},
 {'supercategory': None, 'id': 9, 'name': 'Reference'},
 {'supercategory': None, 'id': 10, 'name': 'Equation'}]

In [69]:
# Load the new annotation export (a COCO-style JSON with a 'categories' table).
annot_path = '/home/lstm/Downloads/project-4-at-2024-05-15-14-08-dbe9c78b/result.json'
# JSON is UTF-8 by specification; pass the encoding explicitly so non-ASCII text
# is decoded the same way regardless of the platform's locale.
with open(annot_path, 'r', encoding='utf-8') as f:
    annot = json.load(f)

# Category table of the export. Its ids do not match the CDLA ids and are
# realigned by name in the following cells.
new_label = annot['categories']
new_label

[{'id': 0, 'name': 'Equation'},
 {'id': 1, 'name': 'Figure'},
 {'id': 2, 'name': 'Figure caption'},
 {'id': 3, 'name': 'Footer'},
 {'id': 4, 'name': 'Header'},
 {'id': 5, 'name': 'Reference'},
 {'id': 6, 'name': 'Table'},
 {'id': 7, 'name': 'Table caption'},
 {'id': 8, 'name': 'Text'},
 {'id': 9, 'name': 'Title'},
 {'id': 10, 'name': 'block_quote'},
 {'id': 11, 'name': 'box_quote'},
 {'id': 12, 'name': 'caption_block'},
 {'id': 13, 'name': 'claim'},
 {'id': 14, 'name': 'fn_marker'},
 {'id': 15, 'name': 'header_image'},
 {'id': 16, 'name': 'list_item'},
 {'id': 17, 'name': 'section_header'},
 {'id': 18, 'name': 'sig_block'},
 {'id': 19, 'name': 'header_text'}]

In [70]:
# Tabulate the CDLA reference categories so they can be matched by name
df_org = pd.DataFrame(data=org_label)
df_org


Unnamed: 0,supercategory,id,name
0,,1,Text
1,,2,Title
2,,3,Figure
3,,4,Figure caption
4,,5,Table
5,,6,Table caption
6,,7,Header
7,,8,Footer
8,,9,Reference
9,,10,Equation


In [71]:
# Same tabular view for the categories of the new annotation export
df_new = pd.DataFrame(data=new_label)
df_new

Unnamed: 0,id,name
0,0,Equation
1,1,Figure
2,2,Figure caption
3,3,Footer
4,4,Header
5,5,Reference
6,6,Table
7,7,Table caption
8,8,Text
9,9,Title


In [None]:
# Align the new label set with the CDLA id space:
#   * a label whose name exists in CDLA takes the CDLA id;
#   * a label unknown to CDLA gets a fresh id after the largest CDLA id,
#     in the order the labels appear in df_new.
# Fresh ids are counted from the CDLA maximum (not from the ids assigned so far),
# so they cannot collide with a CDLA id that is matched later in the loop.
name_to_org_id = {}
for org_name, org_id in zip(df_org['name'], df_org['id']):
    # First occurrence wins if CDLA ever lists a name twice.
    name_to_org_id.setdefault(org_name, int(org_id))

next_free_id = int(df_org['id'].max()) + 1

org_ids = []
for label_name in df_new['name']:
    if label_name in name_to_org_id:
        org_ids.append(name_to_org_id[label_name])
    else:
        org_ids.append(next_free_id)
        next_free_id += 1

df_mod = df_new.copy()
df_mod['org_id'] = org_ids  # plain ints, so no NaN placeholder / float round-trip is needed
# Order by the aligned id and drop the export's original 'id' column.
df_mod = df_mod.sort_values('org_id').reset_index(drop=True).drop(columns=['id'])

In [89]:
# Sanity check: the leading rows of df_mod must reproduce the CDLA table exactly
from IPython.display import display_html

n_org = len(df_org)
df_mod_org = df_mod.iloc[:n_org]

assert df_mod_org['name'].equals(df_org['name']), "name column is not congruent"
assert df_mod_org['org_id'].equals(df_org['id']), "id column is not congruent"

# Render both tables as HTML fragments ...
html1 = df_mod.to_html()
html2 = df_org.to_html()

# ... and place them next to each other in a flex row (aligned labels left, CDLA right)
html = f"""
<div style="display:flex;">
<div style="flex:1;">{html1}</div>
<div style="flex:1;">{html2}</div>
</div>
"""

display_html(html, raw=True)

Unnamed: 0,name,org_id
0,Text,1
1,Title,2
2,Figure,3
3,Figure caption,4
4,Table,5
5,Table caption,6
6,Header,7
7,Footer,8
8,Reference,9
9,Equation,10

Unnamed: 0,supercategory,id,name
0,,1,Text
1,,2,Title
2,,3,Figure
3,,4,Figure caption
4,,5,Table
5,,6,Table caption
6,,7,Header
7,,8,Footer
8,,9,Reference
9,,10,Equation


In [90]:
# Turn the aligned id into the frame's index: rename 'org_id' -> 'id', then index by it.
# The guard keeps the cell re-runnable: once 'org_id' has been renamed and moved
# into the index, repeating set_index('id') would raise KeyError.
if 'org_id' in df_mod.columns:
    df_mod = df_mod.rename(columns={'org_id': 'id'}).set_index('id')

# Nested dict of the form {'name': {id: label_name, ...}}
dict_mod = df_mod.to_dict()



In [99]:
# Rebuild the category list ({'id': ..., 'name': ...} records) from the id -> name mapping
cat_list = [
    dict(id=cat_id, name=cat_name)
    for cat_id, cat_name in dict_mod['name'].items()
]
cat_list

[{'id': 1, 'name': 'Text'},
 {'id': 2, 'name': 'Title'},
 {'id': 3, 'name': 'Figure'},
 {'id': 4, 'name': 'Figure caption'},
 {'id': 5, 'name': 'Table'},
 {'id': 6, 'name': 'Table caption'},
 {'id': 7, 'name': 'Header'},
 {'id': 8, 'name': 'Footer'},
 {'id': 9, 'name': 'Reference'},
 {'id': 10, 'name': 'Equation'},
 {'id': 11, 'name': 'block_quote'},
 {'id': 12, 'name': 'box_quote'},
 {'id': 13, 'name': 'caption_block'},
 {'id': 14, 'name': 'claim'},
 {'id': 15, 'name': 'fn_marker'},
 {'id': 16, 'name': 'header_image'},
 {'id': 17, 'name': 'list_item'},
 {'id': 18, 'name': 'section_header'},
 {'id': 19, 'name': 'sig_block'},
 {'id': 20, 'name': 'header_text'}]

In [110]:
# Swap in the CDLA-aligned category table. `new_label` still references the
# export's original category list (old ids), which the remapping below relies on.
annot['categories'] = cat_list

# The category ids changed, so each annotation's 'category_id' has to be translated
# from the old id space to the new one; otherwise the boxes point at the wrong labels.
# NOTE(review): assumes the standard COCO layout (annot['annotations'][i]['category_id']).
old_id_to_name = {cat['id']: cat['name'] for cat in new_label}
name_to_new_id = {cat['name']: cat['id'] for cat in cat_list}
old_to_new_id = {
    old_id: name_to_new_id[name] for old_id, name in old_id_to_name.items()
}

# Build a remapped copy rather than editing annot['annotations'] in place, so that
# re-running this cell never applies the id translation twice.
annot_remapped = dict(annot)
if 'annotations' in annot:
    annot_remapped['annotations'] = [
        {**ann, 'category_id': old_to_new_id[ann['category_id']]}
        for ann in annot['annotations']
    ]

# save cat_list to json in original dataset folder
with open(f'{org_label_root}/new_labels.json', 'w') as f:
    json.dump(cat_list, f)

# save the remapped annotation to json in new dataset folder
os.makedirs(new_dataset_root, exist_ok=True)

with open(f'{new_dataset_root}/labels.json', 'w') as f:
    json.dump(annot_remapped, f)

In [109]:
import os

In [106]:
# Echo the resolved output directory as a sanity check.
# NOTE(review): the recorded output for this cell has no '/train' suffix although the
# config cell appends one; it looks like a leftover from an earlier run. Re-run to refresh.
new_dataset_root

'/home/lstm/github/PaddleDetection/dataset/_KRCASE_20240516'