In [1]:
import xml.etree.ElementTree as ET
import tarfile
import os
import shutil

In [2]:
def decompress_tar_gz(tar_gz_file, extraction_path):
    """Extract a gzip-compressed tar archive into a directory.

    Args:
        tar_gz_file: Path to the ``.tar.gz`` archive to read.
        extraction_path: Directory the archive members are extracted into.

    Raises:
        tarfile.TarError: If the archive cannot be read, or (on interpreters
            with extraction filters) if a member is rejected by the ``data``
            filter, e.g. because it would be written outside
            ``extraction_path``.
        OSError: If the archive or the destination cannot be accessed.
    """
    with tarfile.open(tar_gz_file, 'r:gz') as tar:
        if hasattr(tarfile, 'data_filter'):
            # The 'data' filter refuses absolute paths, '..' traversal and
            # links pointing outside the destination, and avoids the
            # DeprecationWarning raised when no filter is given.
            tar.extractall(path=extraction_path, filter='data')
        else:
            # Older interpreters have no extraction filters; fall back to the
            # plain call. Only use this with archives from a trusted source.
            tar.extractall(path=extraction_path)

In [8]:
# Example usage: unpack the downloaded PMC baseline archive.
# NOTE: this is a machine-specific absolute path; adjust it before running
# the notebook on another machine.
tar_gz_file_path = 'C:\\Users\\sabri\\Downloads\\oa_comm_xml.PMC010xxxxxx.baseline.2023-12-18.tar.gz'

# The archive contents go into an "All_Articles" folder inside the current
# working directory.
extraction_path = os.path.join(os.getcwd(), 'All_Articles')

# Decompress the tar.gz file into that folder
decompress_tar_gz(tar_gz_file=tar_gz_file_path, extraction_path=extraction_path)

KeyboardInterrupt: 

### Filter by PMC ID

In [57]:
# Folder containing the extracted article files
xml_folder_path = 'All_Articles'

# PMC IDs of the articles to copy (matched against the file name without
# its extension)
desired_article_ids = ['PMC176546', 'PMC176545', 'PMC176548']

# Folder that receives the matching files; created if it doesn't exist
output_folder = 'Extracted_XML'
os.makedirs(output_folder, exist_ok=True)

# A set gives constant-time membership checks while scanning a large folder
wanted_article_ids = set(desired_article_ids)

# Sorted so the copy order (and the printed log) is the same on every run
for xml_file in sorted(os.listdir(xml_folder_path)):
    source_path = os.path.join(xml_folder_path, xml_file)

    # os.listdir also returns sub-directories; shutil.copy2 cannot copy a
    # directory, so anything that is not a regular file is skipped.
    if not os.path.isfile(source_path):
        continue

    # Extract the basename (file name without extension)
    file_basename = os.path.splitext(xml_file)[0]

    if file_basename in wanted_article_ids:
        destination_path = os.path.join(output_folder, xml_file)

        # copy2 copies the file content together with its metadata
        shutil.copy2(source_path, destination_path)

        print(f"Matching XML file '{xml_file}' copied to '{output_folder}'.")

Matching XML file 'PMC176545.xml' copied to 'Extracted_XML'.
Matching XML file 'PMC176546.xml' copied to 'Extracted_XML'.
Matching XML file 'PMC176548.xml' copied to 'Extracted_XML'.


### Filter by PubMed ID (PMID) instead of PMC ID

In [5]:
# Folder containing the individual XML files. Built with os.path.join so the
# literal contains no backslash escape sequence and works on any platform.
input_folder_path = os.path.join('All_Articles', 'PMC000xxxxxx')

# PubMed IDs of the articles to keep. Repeated entries are harmless: the list
# is collapsed into a set before matching.
desired_pmids = [32111634, 32615090, 32859898, 32913197, 33259205, 32619440, 32823573, 32205883, 33382950, 33319778,
                 33443161, 32703946, 31628411, 31857433, 30852164, 31519811, 30673913, 31578309, 31681563, 34946037,
                 37938227, 34668583, 34944797, 34658896, 34739342, 33949105, 34001868, 34739338, 34857933, 33785070,
                 33646117, 34694375, 33075129, 32776003, 32376096, 33240188, 31998276, 33260635, 32376073, 34694375,
                 36332011, 33627512, 34039428, 34836426, 34806775, 34580292, 34580292, 34000281, 34282029, 34312385,
                 33550886, 34777363, 33925452, 34885050, 37073345, 34339295, 34111652, 34021796, 34099917, 34847376,
                 35475000, 34105518, 33137688, 34016512, 32622559, 34680258, 34616012, 34081399, 34158656, 33662870,
                 34442780, 34646011, 34093985, 34043764, 34467251, 34767456, 33469015, 34115827, 33733474, 34946037,
                 37938227, 34668583, 34944797, 34658896, 34739342, 33949105, 34001868, 34739338, 34857933, 33785070,
                 33646117, 34694375, 33075129, 32776003, 32376096, 33240188, 31998276, 33260635, 32376073, 36771343]


def select_articles_by_pmid(input_root, pmids):
    """Collect the root elements of all XML articles whose PMID is wanted.

    Walks ``input_root`` recursively, parses every ``*.xml`` file and looks
    up the first ``<article-id pub-id-type="pmid">`` element in it.

    Args:
        input_root: Directory to scan, including its sub-directories.
        pmids: Iterable of PubMed IDs, given as ints or strings.

    Returns:
        A list with the root element of each matching document, in sorted
        directory/file order. Empty if nothing matches or ``input_root``
        does not exist.
    """
    # Element text is always a string, so the wanted IDs are normalised to
    # strings too; comparing the text against ints would never match.
    wanted = {str(pmid).strip() for pmid in pmids}

    matches = []
    for root, dirs, files in os.walk(input_root):
        dirs.sort()  # in-place sort makes os.walk descend in a stable order
        for xml_file in sorted(files):
            if not xml_file.endswith('.xml'):
                continue
            xml_file_path = os.path.join(root, xml_file)

            try:
                tree = ET.parse(xml_file_path)
            except ET.ParseError as exc:
                # One malformed file should not abort the whole scan, but it
                # is reported so it does not go unnoticed.
                print(f"Skipping unparsable XML file '{xml_file_path}': {exc}")
                continue

            pmid_element = tree.find('.//article-id[@pub-id-type="pmid"]')
            if pmid_element is None or pmid_element.text is None:
                continue
            if pmid_element.text.strip() in wanted:
                # Keep the whole document root, not just its children
                matches.append(tree.getroot())
    return matches


# Root elements of every article whose PMID is in the desired list
selected_articles = select_articles_by_pmid(input_folder_path, desired_pmids)

# Create a new ElementTree and append the selected articles to its root
combined_tree = ET.ElementTree(ET.Element('CombinedRoot'))
combined_tree.getroot().extend(selected_articles)

# Specify the output path for the selected articles XML file
output_file_path = 'Extracted_Papers/selected_articles.xml'

# The write below fails if the target folder is missing, so create it first
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Write the selected articles to the output file
combined_tree.write(output_file_path, encoding='utf-8', xml_declaration=True)

print(f"Selected articles saved to '{output_file_path}'.")

Selected articles saved to 'Extracted_Papers/selected_articles.xml'.
