# Step One: Identify Papers from different Publishers that contain Tables

## From XML and HTML formats of Papers 

### Identify Papers from Elsevier that contain Tables

In [2]:
import os
import shutil
from bs4 import BeautifulSoup


def is_has_tables(fileName, file_extension):
    """Report whether an Elsevier article file contains a ``ce:table`` tag.

    Args:
        fileName: Path of the file to inspect; it is read as UTF-8 text.
        file_extension: Extension of the file. ``'.html'`` selects
            BeautifulSoup's ``'html.parser'``; any other value selects the
            ``'xml'`` parser.

    Returns:
        True if at least one ``ce:table`` tag is found, otherwise False.
    """
    with open(fileName, 'r', encoding='utf-8') as handle:
        markup = handle.read()
    parser = 'html.parser' if file_extension == '.html' else 'xml'
    document = BeautifulSoup(markup, parser)
    return document.find('ce:table') is not None

def check_files(directory, root=None):
    """Recursively sort the files under *directory* into the output folders.

    ``.xml``/``.html`` files for which ``is_has_tables`` returns True are
    moved into ``has_table_directory``. Files with any other extension are
    moved into ``check_file_failure_directory``. ``.xml``/``.html`` files
    without a table are left where they are. Both destination folders are
    read from module-level globals of the same name.

    The sub-folder layout below the top-level scan folder is mirrored inside
    the destination folder. That layout used to be derived by splitting the
    path on a hard-coded backslash, which only works with Windows separators;
    it is now computed with ``os.path.relpath`` so it behaves the same on
    every platform.

    Args:
        directory: Folder to scan.
        root: Top-level folder of the scan, used to compute the mirrored
            sub-folder. Defaults to *directory*; recursive calls pass it on.

    Returns:
        The module-level ``has_table_directory`` path.
    """
    if root is None:
        root = directory
    for target in (has_table_directory, check_file_failure_directory):
        os.makedirs(target, exist_ok=True)

    # Location of `directory` relative to the scan root; '' at the root itself
    # so that top-level files land directly in the destination folder.
    sub_dir = os.path.relpath(directory, root)
    if sub_dir == os.curdir:
        sub_dir = ''
    success_path = os.path.join(has_table_directory, sub_dir)
    failure_path = os.path.join(check_file_failure_directory, sub_dir)

    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)
        if os.path.isdir(full_path):
            check_files(full_path, root)
            continue
        if not os.path.isfile(full_path):
            continue
        file_extension = os.path.splitext(full_path)[1]
        if file_extension in ('.xml', '.html'):
            if not is_has_tables(full_path, file_extension):
                # Parsable paper without a table: leave it in place.
                continue
            destination = success_path
        else:
            # Unsupported format: cannot be checked for tables.
            destination = failure_path
        os.makedirs(destination, exist_ok=True)
        shutil.move(full_path, destination)
        print('move <%s> to <%s>' % (full_path, destination))
    return has_table_directory

def tableAuto_main(source_folder):
    """Entry point: sort every paper found under *source_folder*.

    Delegates to ``check_files``; its return value is discarded, so this
    function returns None.
    """
    check_files(source_folder)
    return None

if __name__ == '__main__':
    # NOTE: the '/...' suffixes below are placeholders; as written, all three
    # paths are identical and should be replaced with real folder names.

    # Folder under which all papers are saved.
    root_path = 'Data'
    # Folder holding the papers of the publisher to scan (Elsevier).
    source_folder = root_path + '/...'
    # Destination for papers that contain tables (read by check_files).
    has_table_directory = root_path + '/...'
    # Destination for files that could not be identified (read by check_files).
    check_file_failure_directory = root_path + '/...'

    tableAuto_main(source_folder=source_folder)

move </Users/zixuanzhao/Desktop/3/E/10.1016-j.jmrt.2024.03.209 .xml> to </Users/zixuanzhao/Desktop/3/E-PCT/>


### Identify Papers from the Royal Society of Chemistry that contain Tables

In [2]:
import os
import shutil
from bs4 import BeautifulSoup

def is_has_tables(fileName, file_extension):
    """Tell whether the given article file contains a ``table`` tag.

    Args:
        fileName: Path of the file to inspect; it is read as UTF-8 text.
        file_extension: Extension of the file. ``'.html'`` is parsed with
            ``'html.parser'``; everything else with the ``'xml'`` parser.

    Returns:
        True when a ``table`` tag is present, False otherwise.
    """
    with open(fileName, 'r', encoding='utf-8') as source:
        text = source.read()

    if file_extension == '.html':
        parser_name = 'html.parser'
    else:
        parser_name = 'xml'

    tree = BeautifulSoup(text, parser_name)
    return tree.find('table') is not None

def check_files(directory, has_table_directory, check_file_failure_directory):
    """Walk *directory* recursively and sort its ``.xml``/``.html`` files.

    A file for which ``is_has_tables`` returns True is moved into
    *has_table_directory*; any other ``.xml``/``.html`` file is moved into
    *check_file_failure_directory*. Files with other extensions are left
    untouched. Both destination folders are created if they do not exist.

    NOTE(review): the destination name is the path relative to the folder
    currently being listed, i.e. just the file name, so files from nested
    folders land directly in the destination folder and same-named files
    from different sub-folders would target the same path.

    Args:
        directory: Folder to scan.
        has_table_directory: Destination for files that contain a table.
        check_file_failure_directory: Destination for files without one.
    """
    for output_dir in (has_table_directory, check_file_failure_directory):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

    for name in os.listdir(directory):
        source = os.path.join(directory, name)

        if not os.path.isfile(source):
            # Descend into sub-folders; ignore anything else.
            if os.path.isdir(source):
                check_files(source, has_table_directory, check_file_failure_directory)
            continue

        extension = os.path.splitext(source)[1]
        relative = os.path.relpath(source, directory)
        if extension not in ('.xml', '.html'):
            continue

        if is_has_tables(source, extension):
            target = os.path.join(has_table_directory, relative)
        else:
            target = os.path.join(check_file_failure_directory, relative)

        parent = os.path.dirname(target)
        if not os.path.exists(parent):
            os.makedirs(parent)
        shutil.move(source, target)
        print('Moved <%s> to <%s>' % (source, target))

def tableAuto_main(source_folder, has_table_directory, check_file_failure_directory):
    """Entry point: sort the papers under *source_folder* via ``check_files``.

    Args:
        source_folder: Folder holding the publisher's papers.
        has_table_directory: Destination for papers that contain tables.
        check_file_failure_directory: Destination for the remaining
            ``.xml``/``.html`` papers.
    """
    check_files(
        directory=source_folder,
        has_table_directory=has_table_directory,
        check_file_failure_directory=check_file_failure_directory,
    )

if __name__ == '__main__':
    # Folder under which all papers are saved.
    root_path = 'Data'
    # Papers of the publisher to scan (Royal Society of Chemistry).
    source_folder = os.path.join(root_path, 'RSC')
    # Destination for papers that contain tables.
    has_table_directory = os.path.join(root_path, 'RSC-PCT')
    # Destination for papers in which no table was identified.
    check_file_failure_directory = os.path.join(root_path, 'RSC-Fail2Indentification')

    tableAuto_main(
        source_folder,
        has_table_directory,
        check_file_failure_directory,
    )

Moved <Data/Paper/RSC/10.1039-b811775f.html> to <Data/Paper/RSC-PCT/10.1039-b811775f.html>


### Identify Papers from Springer Nature that contain Tables

In [3]:
import os
import shutil
from bs4 import BeautifulSoup

def is_has_tables(fileName, file_extension):
    """Check a Springer Nature article file for a ``table-wrap`` tag.

    Args:
        fileName: Path of the file to inspect; it is read as UTF-8 text.
        file_extension: Extension of the file; ``'.html'`` uses the
            ``'html.parser'`` backend, any other value the ``'xml'`` one.

    Returns:
        True if a ``table-wrap`` tag exists in the parsed document, else False.
    """
    use_html = file_extension == '.html'
    with open(fileName, 'r', encoding='utf-8') as stream:
        raw = stream.read()
    parsed = BeautifulSoup(raw, 'html.parser') if use_html else BeautifulSoup(raw, 'xml')
    match = parsed.find('table-wrap')
    return match is not None

def check_files(directory, has_table_directory, check_file_failure_directory):
    """Recursively sort the ``.xml``/``.html`` files found under *directory*.

    Files for which ``is_has_tables`` returns True go to
    *has_table_directory*; the other ``.xml``/``.html`` files go to
    *check_file_failure_directory*. Files with any other extension stay where
    they are. Both destination folders are created up front when missing.

    NOTE(review): the relative path is computed against the folder currently
    being listed, so it is always the bare file name and nested folders are
    flattened into the destination folder.

    Args:
        directory: Folder to scan.
        has_table_directory: Destination for files that contain a table.
        check_file_failure_directory: Destination for files without one.
    """
    def relocate(origin, destination):
        # Ensure the destination's parent folder exists, then move and log.
        parent = os.path.dirname(destination)
        if not os.path.exists(parent):
            os.makedirs(parent)
        shutil.move(origin, destination)
        print('Moved <%s> to <%s>' % (origin, destination))

    for folder in (has_table_directory, check_file_failure_directory):
        if not os.path.exists(folder):
            os.makedirs(folder)

    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        if os.path.isfile(item_path):
            suffix = os.path.splitext(item_path)[1]
            rel_name = os.path.relpath(item_path, directory)
            if suffix in ('.xml', '.html'):
                if is_has_tables(item_path, suffix):
                    relocate(item_path, os.path.join(has_table_directory, rel_name))
                else:
                    relocate(item_path, os.path.join(check_file_failure_directory, rel_name))
        elif os.path.isdir(item_path):
            check_files(item_path, has_table_directory, check_file_failure_directory)

def tableAuto_main(source_folder, has_table_directory, check_file_failure_directory):
    """Run the table check over *source_folder* by calling ``check_files``.

    Args:
        source_folder: Folder holding the publisher's papers.
        has_table_directory: Destination for papers that contain tables.
        check_file_failure_directory: Destination for the remaining
            ``.xml``/``.html`` papers.
    """
    destinations = (has_table_directory, check_file_failure_directory)
    check_files(source_folder, *destinations)

if __name__ == '__main__':
    # Folder under which all papers are saved.
    root_path = 'Data'
    # Papers of the publisher to scan (Springer Nature).
    source_folder = os.path.join(root_path, 'SN')
    # Destination for papers that contain tables.
    has_table_directory = os.path.join(root_path, 'SN-PCT')
    # Destination for papers in which no table was identified.
    check_file_failure_directory = os.path.join(root_path, 'SN-Fail2Indentification')

    tableAuto_main(
        source_folder,
        has_table_directory,
        check_file_failure_directory,
    )


Moved <Data/Paper/SN/10.1007-s00170-019-04167-2.xml> to <Data/Paper/SN-PCT/10.1007-s00170-019-04167-2.xml>
