# Step Two: Simplify Tables from XML/HTML Formats into a Customized Table Format and Save Them as JSON Documents

### Simplify Table from Elsevier 

In [1]:
import os
import xml.etree.ElementTree as ET
import json
from bs4 import BeautifulSoup
import shutil

def parse(path):
    """Convert every table in the Elsevier XML files under *path* to JSON.

    The folder is walked recursively with ``traverse_folder``; only files whose
    name ends in ``.xml`` are processed.  For each ``<table>`` element a dict
    is built with two keys:

    * ``'title'``  -- the text of the table's ``<label>`` followed by ``': '``
      and the text of its ``<simple-para>`` (either part may be absent), with
      every space character removed;
    * ``'values'`` -- the header rows returned by ``parse_row`` (wrapped in
      ``<thead>``/``</thead>`` markers) followed by ONE final element that
      holds the list of body rows (wrapped in ``<tbody>``/``</tbody>``
      markers).  That final element is appended even when it is empty.

    Tables are written through ``save`` as ``table1.json``, ``table2.json``, ...
    If converting a table raises, the source file is copied aside with
    ``copy_fail_file`` and the remaining tables of that file are skipped.
    A file that is not well-formed XML is set aside the same way instead of
    aborting the whole run.

    Args:
        path: Root folder that contains the publisher's XML files.
    """
    for file_path in traverse_folder(path):
        if not file_path.endswith('.xml'):
            continue
        try:
            root = ET.parse(file_path).getroot()
        except ET.ParseError:
            # A malformed document is treated like any other conversion
            # failure: set the file aside and carry on with the next one.
            copy_fail_file(file_path)
            continue
        # Re-serialise the tree and drop newlines before handing the markup
        # to BeautifulSoup.
        root_str = ET.tostring(root).decode().replace('\n', '')
        soup = BeautifulSoup(root_str, 'xml')
        count = 1
        for table in soup.find_all('table'):
            try:
                info = {}
                title = ''
                label = table.find('label')
                if label and label.get_text():
                    title += label.get_text() + ": "
                simple_para = table.find("simple-para")
                if simple_para and simple_para.get_text():
                    title += simple_para.get_text() + " "
                info['title'] = title.replace(' ', '')

                rows_list = parse_row(table.find('thead'))
                if rows_list:
                    rows_list[0][0] = '<thead>' + rows_list[0][0]
                    rows_list[-1][-1] += '</thead>'

                tbody_row_list = parse_row(table.find('tbody'))
                if tbody_row_list:
                    tbody_row_list[0][0] = '<tbody>' + tbody_row_list[0][0]
                    tbody_row_list[-1][-1] += '</tbody>'
                # The body rows go in as a single nested element, empty or not.
                rows_list.append(tbody_row_list)
                info['values'] = rows_list

                save(file_path, f'table{count}.json', info)
                count += 1
            except Exception:
                # Best effort: one broken table marks the whole file as failed.
                copy_fail_file(file_path)
                break

def parse_row(tag):
    """Flatten the ``<row>``/``<entry>`` children of *tag* into marker strings.

    Every ``<entry>`` becomes a string of the form
    ``<entry morerows=.. namest=.. nameend=.. role=..:text>`` in which only
    the attributes actually present are emitted (double quotes removed) and
    the ``:text`` part is added only when the entry has text.  The text is
    stripped and all of its space characters are removed.  The first entry of
    a row is prefixed with ``<row>`` and the last one suffixed with ``</row>``.
    Rows without any entry are skipped.

    Args:
        tag: Parsed element to search (e.g. a ``thead`` or ``tbody`` tag); a
            falsy value such as ``None`` yields an empty result.

    Returns:
        A list of rows, each row being a list of entry strings.
    """
    if not tag:
        return []

    collected_rows = []
    for row_tag in tag.find_all('row'):
        cells = []
        for entry_tag in row_tag.find_all('entry'):
            pieces = ['<entry']
            cell_text = entry_tag.get_text()
            cell_attrs = entry_tag.attrs or {}
            # Only the span/role attributes are carried over, in this order.
            for key in ('morerows', 'namest', 'nameend', 'role'):
                if key in cell_attrs:
                    pieces.append(f' {key}=' + cell_attrs[key].replace('"', ''))
            if cell_text:
                pieces.append(':' + cell_text.strip().replace(' ', ''))
            pieces.append('>')
            cells.append(''.join(pieces))
        if cells:
            cells[0] = '<row>' + cells[0]
            cells[-1] += '</row>'
            collected_rows.append(cells)
    return collected_rows

def traverse_folder(folder_path):
    """Return the paths of all files found anywhere below *folder_path*.

    Each path is the directory reported by ``os.walk`` joined with the file
    name, so it is only absolute when *folder_path* itself is absolute.

    Args:
        folder_path: Folder to walk recursively.

    Returns:
        A list of file paths, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]

def copy_fail_file(file_path: str):
    """Copy a source file whose tables could not be converted into ``Failed-Table``.

    Args:
        file_path: Path of the source file to set aside.
    """
    per_file_dir = generate_path(file_path, 'Failed-Table')  ### folder for files whose tables could not be identified
    # generate_path ends with a per-file component; cut it off so the file is
    # copied straight into the shared Failed-Table folder.
    failed_dir = per_file_dir[:per_file_dir.rfind(os.sep)]
    create_file(failed_dir)
    shutil.copy(file_path, failed_dir)

def save(file_path: str, table_name: str, info):
    """Write *info* as UTF-8 JSON into the ``E-Table`` folder of *file_path*.

    Nothing is written when *info* is empty or otherwise falsy.

    Args:
        file_path: Path of the source file the table was extracted from.
        table_name: File name of the JSON document to create.
        info: JSON-serialisable table description.
    """
    if not info:
        return

    out_dir = generate_path(file_path, 'E-Table')  ### output folder for the converted tables
    create_file(out_dir)
    target = os.path.join(out_dir, table_name)
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    with open(target, 'w', encoding='utf-8') as fp:
        json.dump(info, fp, ensure_ascii=False)

def generate_path(file_path, folder_name):
    """Build the per-file output folder that sits next to the source folder.

    For ``<base>/<publisher>/<name>.<ext>`` the result is
    ``<base>/<folder_name>/<name>``.  Surrounding whitespace is stripped from
    the base directory and from the file stem.

    The path is taken apart with ``os.path`` helpers rather than manual
    ``rfind`` slicing, so files without an extension, paths with a single
    directory level, absolute paths near the root and directory names that
    contain a dot are all handled correctly.

    Args:
        file_path: Path of the source file.
        folder_name: Name of the sibling folder that collects the output.

    Returns:
        The output folder path for this source file.
    """
    base_dir = os.path.dirname(os.path.dirname(file_path))
    stem = os.path.splitext(os.path.basename(file_path))[0]
    return os.path.join(base_dir.strip(), folder_name, stem.strip())

def create_file(path):
    """Make sure the directory *path* exists, creating missing parents.

    Despite its name this creates a directory, not a file.  Calling it for a
    directory that already exists is a no-op.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    # exist_ok avoids the race between a separate existence check and the
    # creation when another process creates the folder in between.
    os.makedirs(path, exist_ok=True)

# Script entry point: convert every XML file found below Data/E.
if __name__ == '__main__':
    path = os.path.join('Data', 'E')  ### input root: data folder / publisher folder
    parse(path)


### Simplify Table from Royal Society of Chemistry

In [2]:
import os
from bs4 import BeautifulSoup
import json
import shutil

def parse(path):
    """Convert every table in the RSC HTML files under *path* to JSON.

    The folder is walked recursively with ``traverse_folder``; only files whose
    name ends in ``.html`` are processed.  Captions
    (``<div class="table_caption">``) and tables (``<table>`` with class
    ``tgroup`` or ``rtable``) are paired by position.  For each table a dict
    is built with

    * one entry whose key is the text of the caption's ``<b>`` element and
      whose value is the text of the caption's ``<span>`` element;
    * ``'values'`` -- the header rows returned by ``parse_table`` (wrapped in
      ``<thead>`` markers), followed by one nested element with the footer
      rows (``<tfoot>`` markers) and one nested element with the body rows
      (``<tbody>`` markers); a nested element is only added when that section
      has rows.

    Tables are written through ``save`` as ``table1.json``, ``table2.json``, ...
    If converting a table raises (for example because no caption matches),
    the source file is copied aside with ``copy_fail_file`` and the remaining
    tables of that file are skipped.

    Args:
        path: Root folder that contains the publisher's HTML files.
    """
    for file_path in traverse_folder(path):
        if not file_path.endswith('.html'):
            continue
        # The context manager closes the handle once the markup has been
        # read by the BeautifulSoup constructor.
        with open(file_path, 'r', encoding='utf-8') as fp:
            soup = BeautifulSoup(fp, 'html.parser')
        table_titles = soup.find_all('div', {'class': 'table_caption'})
        tables = soup.find_all('table', {'class': ['tgroup', 'rtable']})
        count = 0
        for table in tables:
            try:
                info = {}
                # Captions are matched to tables purely by their index.
                title_info = table_titles[count]
                title_key = title_info.find('b').get_text()
                title = title_info.find('span').get_text()
                info[title_key] = title

                rows_list = parse_table(table.find('thead'), 'th')
                if rows_list:
                    rows_list[0][0] = '<thead>' + rows_list[0][0]
                    rows_list[-1][-1] += '</thead>'

                # Footer cells are looked up as <th>, as for the header.
                tfoot_rows_list = parse_table(table.find('tfoot'), 'th')
                if tfoot_rows_list:
                    tfoot_rows_list[0][0] = '<tfoot>' + tfoot_rows_list[0][0]
                    tfoot_rows_list[-1][-1] += '</tfoot>'
                    rows_list.append(tfoot_rows_list)

                tbody_row_list = parse_table(table.find('tbody'), 'td')
                if tbody_row_list:
                    tbody_row_list[0][0] = '<tbody>' + tbody_row_list[0][0]
                    tbody_row_list[-1][-1] += '</tbody>'
                    rows_list.append(tbody_row_list)

                info['values'] = rows_list
                save(file_path, f'table{count + 1}.json', info)
                count += 1
            except Exception:
                # Best effort: one broken table marks the whole file as failed.
                copy_fail_file(file_path)
                break


def parse_table(tag, chilld):
    """Flatten the ``<tr>`` rows of *tag* into marker strings.

    Every cell named *chilld* becomes a string of the form
    ``<chilld rowspan=.. colspan=..:text>`` in which only the span attributes
    actually present are emitted (double quotes removed) and the ``:text``
    part is added only when the cell has text.  The text is used as-is.  The
    first cell of a row is prefixed with ``<tr>`` and the last one suffixed
    with ``</tr>``.  Rows without any matching cell are skipped.

    Args:
        tag: Parsed element to search (e.g. a ``thead`` or ``tbody`` tag); a
            falsy value such as ``None`` yields an empty result.
        chilld: Name of the cell element to collect, e.g. ``'th'`` or ``'td'``.

    Returns:
        A list of rows, each row being a list of cell strings.
    """
    if not tag:
        return []

    table_rows = []
    for tr in tag.find_all('tr'):
        cells = []
        for cell in tr.find_all(chilld):
            pieces = [f'<{chilld}']
            cell_text = cell.get_text()
            cell_attrs = cell.attrs or {}
            for span in ('rowspan', 'colspan'):
                if span in cell_attrs:
                    pieces.append(f' {span}=' + cell_attrs[span].replace('"', ''))
            if cell_text:
                pieces.append(':' + cell_text)
            pieces.append('>')
            cells.append(''.join(pieces))
        if cells:
            cells[0] = '<tr>' + cells[0]
            cells[-1] += '</tr>'
            table_rows.append(cells)
    return table_rows


def traverse_folder(folder_path):
    """Return the paths of all files found anywhere below *folder_path*.

    Each path is the directory reported by ``os.walk`` joined with the file
    name, so it is only absolute when *folder_path* itself is absolute.

    Args:
        folder_path: Folder to walk recursively.

    Returns:
        A list of file paths, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]


def copy_fail_file(file_path:str):
    """Copy a source file whose tables could not be converted into ``Failed-Table``.

    Args:
        file_path: Path of the source file to set aside.
    """
    per_file_dir = generate_path(file_path, 'Failed-Table')  ### folder for files whose tables could not be identified
    # generate_path ends with a per-file component; cut it off so the file is
    # copied straight into the shared Failed-Table folder.
    failed_dir = per_file_dir[:per_file_dir.rfind(os.sep)]
    create_file(failed_dir)
    shutil.copy(file_path, failed_dir)


def save(file_path: str, table_name: str, info):
    """Write *info* as UTF-8 JSON into the ``RSC-Table`` folder of *file_path*.

    Nothing is written when *info* is empty or otherwise falsy.

    Args:
        file_path: Path of the source file the table was extracted from.
        table_name: File name of the JSON document to create.
        info: JSON-serialisable table description.
    """
    if not info:
        return

    out_dir = generate_path(file_path,'RSC-Table')  ### output folder for the converted tables
    create_file(out_dir)
    target = os.path.join(out_dir, table_name)
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    with open(target, 'w', encoding='utf-8') as fp:
        json.dump(info, fp, ensure_ascii=False)


def generate_path(file_path,folder_name):
    """Build the per-file output folder that sits next to the source folder.

    For ``<base>/<publisher>/<name>.<ext>`` the result is
    ``<base>/<folder_name>/<name>``.  Surrounding whitespace is stripped from
    the base directory and from the file stem.

    The path is taken apart with ``os.path`` helpers rather than manual
    ``rfind`` slicing, so files without an extension, paths with a single
    directory level, absolute paths near the root and directory names that
    contain a dot are all handled correctly.

    Args:
        file_path: Path of the source file.
        folder_name: Name of the sibling folder that collects the output.

    Returns:
        The output folder path for this source file.
    """
    base_dir = os.path.dirname(os.path.dirname(file_path))
    stem = os.path.splitext(os.path.basename(file_path))[0]
    return os.path.join(base_dir.strip(), folder_name, stem.strip())


def create_file(path):
    """Make sure the directory *path* exists, creating missing parents.

    Despite its name this creates a directory, not a file.  Calling it for a
    directory that already exists is a no-op.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    # exist_ok avoids the race between a separate existence check and the
    # creation when another process creates the folder in between.
    os.makedirs(path, exist_ok=True)


# Script entry point: convert every HTML file found below Data/RSC.
if __name__ == '__main__':
    path = os.path.join('Data', 'RSC') ### input root: data folder / publisher folder
    parse(path)

### Simplify Table from Springer Nature

In [3]:
import os
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import json
import shutil

def parse(path):
    """Convert every table in the Springer Nature XML files under *path* to JSON.

    The folder is walked recursively with ``traverse_folder``; only files whose
    name ends in ``.xml`` are processed.  For each ``<table-wrap>`` element a
    dict is built with

    * one entry whose key is the text of the table's ``<label>`` and whose
      value is the text of its first ``<p>`` with every space removed;
    * ``'values'`` -- the header rows returned by ``parse_table`` (wrapped in
      ``<thead>``/``</thead>`` markers) followed by ONE final element that
      holds the list of body rows (wrapped in ``<tbody>``/``</tbody>``
      markers).  That final element is appended even when it is empty.

    Tables are written through ``save`` as ``table1.json``, ``table2.json``, ...
    If converting a table raises (for example because the label or caption
    is missing), the source file is copied aside with ``copy_fail_file`` and
    the remaining tables of that file are skipped.  A file that is not
    well-formed XML is set aside the same way instead of aborting the run.

    Args:
        path: Root folder that contains the publisher's XML files.
    """
    for file_path in traverse_folder(path):
        if not file_path.endswith('.xml'):
            continue
        try:
            root = ET.parse(file_path).getroot()
        except ET.ParseError:
            # A malformed document is treated like any other conversion
            # failure: set the file aside and carry on with the next one.
            copy_fail_file(file_path)
            continue
        # Re-serialise the tree and drop newlines before handing the markup
        # to BeautifulSoup.
        root_str = ET.tostring(root).decode().replace('\n', '')
        soup = BeautifulSoup(root_str, 'xml')
        count = 1
        for table in soup.find_all('table-wrap'):
            try:
                info = {}
                label = table.find('label').get_text()
                title = table.find("p").get_text().replace(' ', '')
                info[label] = title

                rows_list = parse_table(table.find('thead'), 'th')
                if rows_list:
                    rows_list[0][0] = '<thead>' + rows_list[0][0]
                    rows_list[-1][-1] += '</thead>'

                tbody_row_list = parse_table(table.find('tbody'), 'td')
                if tbody_row_list:
                    tbody_row_list[0][0] = '<tbody>' + tbody_row_list[0][0]
                    tbody_row_list[-1][-1] += '</tbody>'
                # The body rows go in as a single nested element, empty or not.
                rows_list.append(tbody_row_list)
                info['values'] = rows_list

                save(file_path, f'table{count}.json', info)
                count += 1
            except Exception:
                # Best effort: one broken table marks the whole file as failed.
                copy_fail_file(file_path)
                break


def parse_table(tag, chilld):
    """Flatten the ``<tr>`` rows of *tag* into marker strings.

    Every cell named *chilld* becomes a string of the form
    ``<chilld rowspan=.. colspan=..:text>`` in which only the span attributes
    actually present are emitted (double quotes removed) and the ``:text``
    part is added only when the cell has text.  The text is stripped of
    leading and trailing whitespace.  The first cell of a row is prefixed
    with ``<tr>`` and the last one suffixed with ``</tr>``.  Rows without any
    matching cell are skipped.

    Args:
        tag: Parsed element to search (e.g. a ``thead`` or ``tbody`` tag); a
            falsy value such as ``None`` yields an empty result.
        chilld: Name of the cell element to collect, e.g. ``'th'`` or ``'td'``.

    Returns:
        A list of rows, each row being a list of cell strings.
    """
    if not tag:
        return []

    table_rows = []
    for tr in tag.find_all('tr'):
        cells = []
        for cell in tr.find_all(chilld):
            pieces = [f'<{chilld}']
            cell_text = cell.get_text()
            cell_attrs = cell.attrs or {}
            for span in ('rowspan', 'colspan'):
                if span in cell_attrs:
                    pieces.append(f' {span}=' + cell_attrs[span].replace('"', ''))
            # The test is on the raw text, so whitespace-only text still
            # produces the ':' separator.
            if cell_text:
                pieces.append(':' + cell_text.strip())
            pieces.append('>')
            cells.append(''.join(pieces))
        if cells:
            cells[0] = '<tr>' + cells[0]
            cells[-1] += '</tr>'
            table_rows.append(cells)
    return table_rows


def traverse_folder(folder_path):
    """Return the paths of all files found anywhere below *folder_path*.

    Each path is the directory reported by ``os.walk`` joined with the file
    name, so it is only absolute when *folder_path* itself is absolute.

    Args:
        folder_path: Folder to walk recursively.

    Returns:
        A list of file paths, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]


def copy_fail_file(file_path:str):
    """Copy a source file whose tables could not be converted into ``Failed-Table``.

    Args:
        file_path: Path of the source file to set aside.
    """
    per_file_dir = generate_path(file_path, 'Failed-Table')  ### folder for files whose tables could not be identified
    # generate_path ends with a per-file component; cut it off so the file is
    # copied straight into the shared Failed-Table folder.
    failed_dir = per_file_dir[:per_file_dir.rfind(os.sep)]
    create_file(failed_dir)
    shutil.copy(file_path, failed_dir)


def save(file_path: str, table_name: str, info):
    """Write *info* as UTF-8 JSON into the ``SN-Table`` folder of *file_path*.

    Nothing is written when *info* is empty or otherwise falsy.

    Args:
        file_path: Path of the source file the table was extracted from.
        table_name: File name of the JSON document to create.
        info: JSON-serialisable table description.
    """
    if not info:
        return

    out_dir = generate_path(file_path,'SN-Table')  ### output folder for the converted tables
    create_file(out_dir)
    target = os.path.join(out_dir, table_name)
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    with open(target, 'w', encoding='utf-8') as fp:
        json.dump(info, fp, ensure_ascii=False)


def generate_path(file_path,folder_name):
    """Build the per-file output folder that sits next to the source folder.

    For ``<base>/<publisher>/<name>.<ext>`` the result is
    ``<base>/<folder_name>/<name>``.  Surrounding whitespace is stripped from
    the base directory and from the file stem.

    The path is taken apart with ``os.path`` helpers rather than manual
    ``rfind`` slicing, so files without an extension, paths with a single
    directory level, absolute paths near the root and directory names that
    contain a dot are all handled correctly.

    Args:
        file_path: Path of the source file.
        folder_name: Name of the sibling folder that collects the output.

    Returns:
        The output folder path for this source file.
    """
    base_dir = os.path.dirname(os.path.dirname(file_path))
    stem = os.path.splitext(os.path.basename(file_path))[0]
    return os.path.join(base_dir.strip(), folder_name, stem.strip())


def create_file(path):
    """Make sure the directory *path* exists, creating missing parents.

    Despite its name this creates a directory, not a file.  Calling it for a
    directory that already exists is a no-op.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    # exist_ok avoids the race between a separate existence check and the
    # creation when another process creates the folder in between.
    os.makedirs(path, exist_ok=True)


# Script entry point: convert every XML file found below Data/SN.
if __name__ == '__main__':
    path = os.path.join('Data', 'SN') ### input root: data folder / publisher folder
    parse(path)

### Special XML Table Format Without Thead in Springer Nature

In [None]:
import os
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import json
import shutil

def parse(path):
    """Convert Springer Nature XML tables under *path* to JSON, tolerating a missing header.

    The folder is walked recursively with ``traverse_folder``; only files whose
    name ends in ``.xml`` are processed.  For each ``<table-wrap>`` element a
    dict is built with

    * one entry whose key is the text of the table's ``<label>`` and whose
      value is the text of its first ``<p>`` with every space removed;
    * ``'thead'`` -- the ``<thead>`` element serialised with ``str()``, or
      ``None`` when the table has no ``<thead>``;
    * ``'tbody'`` -- the body rows returned by ``parse_table``, wrapped in
      ``<tbody>``/``</tbody>`` markers when there is at least one row.

    Tables are written through ``save`` as ``table1.json``, ``table2.json``, ...
    If converting a table raises (for example because the label or caption
    is missing), the source file is copied aside with ``copy_fail_file`` and
    the remaining tables of that file are skipped.  A file that is not
    well-formed XML is set aside the same way instead of aborting the run.

    Args:
        path: Root folder that contains the publisher's XML files.
    """
    for file_path in traverse_folder(path):
        if not file_path.endswith('.xml'):
            continue
        try:
            root = ET.parse(file_path).getroot()
        except ET.ParseError:
            # A malformed document is treated like any other conversion
            # failure: set the file aside and carry on with the next one.
            copy_fail_file(file_path)
            continue
        # Re-serialise the tree and drop newlines before handing the markup
        # to BeautifulSoup.
        root_str = ET.tostring(root).decode().replace('\n', '')
        soup = BeautifulSoup(root_str, 'xml')
        count = 1
        for table in soup.find_all('table-wrap'):
            try:
                info = {}
                label = table.find('label').get_text()
                title = table.find("p").get_text().replace(' ', '')
                info[label] = title

                # Keep <thead> verbatim when it exists; otherwise record None.
                thead_tag = table.find('thead')
                info['thead'] = str(thead_tag) if thead_tag else None

                # Convert the <tbody> content into marker strings.
                tbody_row_list = parse_table(table.find('tbody'), 'td')
                if tbody_row_list:
                    tbody_row_list[0][0] = '<tbody>' + tbody_row_list[0][0]
                    tbody_row_list[-1][-1] += '</tbody>'
                info['tbody'] = tbody_row_list

                save(file_path, f'table{count}.json', info)
                count += 1
            except Exception:
                # Best effort: one broken table marks the whole file as failed.
                copy_fail_file(file_path)
                break

def parse_table(tag, child):
    """Flatten the ``<tr>`` rows of *tag* into marker strings.

    Every cell named *child* becomes a string of the form
    ``<child rowspan=.. colspan=..:text>`` in which only the span attributes
    actually present are emitted (values inserted unchanged) and the
    ``:text`` part is added only when the cell has text.  The text is
    stripped of leading and trailing whitespace.  The first cell of a row is
    prefixed with ``<tr>`` and the last one suffixed with ``</tr>``.  Rows
    without any matching cell are skipped.

    Args:
        tag: Parsed element to search (e.g. a ``tbody`` tag); a falsy value
            such as ``None`` yields an empty result.
        child: Name of the cell element to collect, e.g. ``'td'``.

    Returns:
        A list of rows, each row being a list of cell strings.
    """
    if not tag:
        return []

    table_rows = []
    for tr in tag.find_all('tr'):
        cells = []
        for cell in tr.find_all(child):
            pieces = [f'<{child}']
            cell_text = cell.get_text()
            cell_attrs = cell.attrs or {}
            for span in ('rowspan', 'colspan'):
                if span in cell_attrs:
                    pieces.append(f' {span}={cell_attrs[span]}')
            # The test is on the raw text, so whitespace-only text still
            # produces the ':' separator.
            if cell_text:
                pieces.append(':' + cell_text.strip())
            pieces.append('>')
            cells.append(''.join(pieces))
        if cells:
            cells[0] = '<tr>' + cells[0]
            cells[-1] += '</tr>'
            table_rows.append(cells)
    return table_rows

def traverse_folder(folder_path):
    """Return the paths of all files found anywhere below *folder_path*.

    Each path is the directory reported by ``os.walk`` joined with the file
    name, so it is only absolute when *folder_path* itself is absolute.

    Args:
        folder_path: Folder to walk recursively.

    Returns:
        A list of file paths, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]

def copy_fail_file(file_path: str):
    """Copy a source file whose tables could not be converted into ``Failed-Table``.

    The copy keeps the original file name, including its extension.

    Args:
        file_path: Path of the source file to set aside.
    """
    # generate_path ends with a per-file component (the name without its
    # extension); only its parent, the shared Failed-Table folder, is used.
    failed_dir = os.path.dirname(generate_path(file_path, 'Failed-Table'))  ### folder for files whose tables could not be identified
    os.makedirs(failed_dir, exist_ok=True)
    # Copy INTO the folder: using the generated path itself as the target
    # would create an extension-less copy named after the file stem.
    shutil.copy(file_path, failed_dir)

def save(file_path: str, table_name: str, info):
    """Write *info* as UTF-8 JSON into the ``SN-Table`` folder of *file_path*.

    Nothing is written when *info* is empty or otherwise falsy.

    Args:
        file_path: Path of the source file the table was extracted from.
        table_name: File name of the JSON document to create.
        info: JSON-serialisable table description.
    """
    if not info:
        return

    out_dir = generate_path(file_path, 'SN-Table')  ### output folder for the converted tables
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, table_name)
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    with open(target, 'w', encoding='utf-8') as fp:
        json.dump(info, fp, ensure_ascii=False)

def generate_path(file_path, folder_name):
    """Build the per-file output folder that sits next to the source folder.

    For ``<base>/<publisher>/<name>.<ext>`` the result is
    ``<base>/<folder_name>/<name>``.

    Args:
        file_path: Path of the source file.
        folder_name: Name of the sibling folder that collects the output.

    Returns:
        The output folder path for this source file.
    """
    parent_dir, base_name = os.path.split(file_path)
    grandparent_dir = os.path.dirname(parent_dir)
    stem, _extension = os.path.splitext(base_name)
    return os.path.join(grandparent_dir, folder_name, stem)

# Script entry point: convert every XML file found below Data/SN.
if __name__ == '__main__':
    path = 'Data/SN' ### input root: data folder / publisher folder
    parse(path)