In [1]:
from Bio import Entrez
import csv
import os
import xml.etree.ElementTree as ET
import copy

In [2]:
def load_pubmed_csv_to_dict(input_csv):
    """Load the PubMed search-query CSV into a dict keyed by publication title.

    The file must have a header row with the columns ``Pub Title``,
    ``Keywords``, ``SQ_TP``, ``SQ_FP``, ``SQ_R``, ``PubMed_IDs_TP``,
    ``PubMed_IDs_FP`` and ``PubMed_IDs_R``. Every column except ``Pub Title``
    is read as a comma-separated list; items are stripped of surrounding
    whitespace and blank items are dropped.

    Args:
        input_csv: Path to a UTF-8 encoded CSV file.

    Returns:
        dict mapping each ``Pub Title`` to a dict that holds the title plus
        one list of strings per list column. Empty cells yield ``[]``. When a
        title occurs in several rows, the last row wins.

    Raises:
        KeyError: If one of the required columns is missing from the header.
    """
    list_columns = (
        'Keywords',
        'SQ_TP',
        'SQ_FP',
        'SQ_R',
        'PubMed_IDs_TP',
        'PubMed_IDs_FP',
        'PubMed_IDs_R',
    )

    def split_cell(cell):
        # ''.split(',') returns [''], so blank items have to be filtered out
        # explicitly. DictReader fills short rows with None, hence `or ''`.
        return [item.strip() for item in (cell or '').split(',') if item.strip()]

    result_dict = {}

    # newline='' is what the csv module expects, so that line breaks inside
    # quoted fields are parsed correctly.
    with open(input_csv, 'r', encoding='utf-8', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            entry = {'Pub Title': row['Pub Title']}
            for column in list_columns:
                entry[column] = split_cell(row[column])
            result_dict[row['Pub Title']] = entry

    return result_dict

In [5]:
def load_pmc_csv_to_dict(input_csv):
    """Load the PMC search-query CSV into a dict keyed by publication title.

    The file must have a header row with the columns ``Pub Title``,
    ``Keywords``, ``SQ_TP``, ``SQ_FP``, ``SQ_R``, ``PMC_IDs_TP``,
    ``PMC_IDs_FP`` and ``PMC_IDs_R``. Every column except ``Pub Title`` is
    read as a comma-separated list; items are stripped of surrounding
    whitespace and blank items are dropped.

    Args:
        input_csv: Path to a UTF-8 encoded CSV file.

    Returns:
        dict mapping each ``Pub Title`` to a dict that holds the title plus
        one list of strings per list column. Empty cells yield ``[]``. When a
        title occurs in several rows, the last row wins.

    Raises:
        KeyError: If one of the required columns is missing from the header.
    """
    list_columns = (
        'Keywords',
        'SQ_TP',
        'SQ_FP',
        'SQ_R',
        'PMC_IDs_TP',
        'PMC_IDs_FP',
        'PMC_IDs_R',
    )

    def split_cell(cell):
        # ''.split(',') returns [''], so blank items have to be filtered out
        # explicitly. DictReader fills short rows with None, hence `or ''`.
        return [item.strip() for item in (cell or '').split(',') if item.strip()]

    result_dict = {}

    # newline='' is what the csv module expects, so that line breaks inside
    # quoted fields are parsed correctly.
    with open(input_csv, 'r', encoding='utf-8', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            entry = {'Pub Title': row['Pub Title']}
            for column in list_columns:
                entry[column] = split_cell(row[column])
            result_dict[row['Pub Title']] = entry

    return result_dict

In [14]:
def extract_xml_files(input_pmid_dict, input_pmc_dict, input_dir, output_file):
    """Combine the XML articles referenced by the search queries into one file.

    The PMIDs and PMC IDs of all publications are pooled per category
    (``TP``, ``FP``, ``R``). ``input_dir`` is then walked recursively and
    every ``.xml`` file is parsed once. An article is added to a category
    when any of its ``article-id`` elements has ``pub-id-type="pmid"`` with a
    text found in that category's PMIDs, or ``pub-id-type="pmc"`` with a text
    found in that category's PMC IDs. Each article is added at most once per
    category, even when both of its ids match.

    The output is ``<root>`` containing three children, in this order:
    ``SQ_TP_PMIDs_SQ_TP_PMCIDs``, ``SQ_FP_PMIDs_SQ_FP_PMCIDs`` and
    ``SQ_R_PMIDs_SQ_R_PMCIDs``. Each child holds the matching article roots.

    Args:
        input_pmid_dict: Mapping of publication title to a dict with the
            list-valued keys ``PubMed_IDs_TP``, ``PubMed_IDs_FP`` and
            ``PubMed_IDs_R``.
        input_pmc_dict: Mapping of publication title to a dict with the
            list-valued keys ``PMC_IDs_TP``, ``PMC_IDs_FP`` and ``PMC_IDs_R``.
        input_dir: Directory (may be nested) searched for ``.xml`` files.
        output_file: Path of the combined XML file; it is overwritten.

    Raises:
        KeyError: If a publication entry lacks one of the expected keys.
    """

    def collect_ids(source, key):
        # Sets give constant-time membership tests; blank ids (e.g. from an
        # empty CSV cell) can never identify an article and are dropped.
        return {
            article_id
            for data in source.values()
            for article_id in data[key]
            if article_id
        }

    # One (container element, wanted PMIDs, wanted PMC IDs) triple per category.
    groups = [
        (
            ET.Element(f'SQ_{category}_PMIDs_SQ_{category}_PMCIDs'),
            collect_ids(input_pmid_dict, f'PubMed_IDs_{category}'),
            collect_ids(input_pmc_dict, f'PMC_IDs_{category}'),
        )
        for category in ('TP', 'FP', 'R')
    ]

    output_abspath = os.path.abspath(output_file)

    with open(output_file, 'wb') as f:
        f.write(b'<root>\n')

        for root_dir, dirs, files in os.walk(input_dir):
            for xml_file in files:
                if not xml_file.endswith('.xml'):
                    continue

                xml_file_path = os.path.join(root_dir, xml_file)

                # The output may live inside input_dir; never read it back in.
                if os.path.abspath(xml_file_path) == output_abspath:
                    continue

                try:
                    article_root = ET.parse(xml_file_path).getroot()
                except ET.ParseError:
                    print(f"Skipping file due to ParseError: {xml_file_path}")
                    continue

                # Gather the article's ids once, then test every category.
                article_pmids = set()
                article_pmcids = set()
                for id_element in article_root.iter('article-id'):
                    id_type = id_element.attrib.get('pub-id-type')
                    if id_type == 'pmid':
                        article_pmids.add(id_element.text)
                    elif id_type == 'pmc':
                        article_pmcids.add(id_element.text)

                for sq_root, wanted_pmids, wanted_pmcids in groups:
                    if (
                        not article_pmids.isdisjoint(wanted_pmids)
                        or not article_pmcids.isdisjoint(wanted_pmcids)
                    ):
                        # The parsed tree is only serialised afterwards and
                        # never modified, so sharing it between categories
                        # needs no copy.
                        sq_root.append(article_root)

        for sq_root, _, _ in groups:
            f.write(ET.tostring(sq_root, encoding='utf-8'))

        f.write(b'</root>')

In [15]:
# Example usage: load both ID tables, then merge the matching articles.
input_csv = "PMID_Sabrina.csv"
loaded_pubmed_dict = load_pubmed_csv_to_dict(input_csv)

input_csv = "PMC_Sabrina.csv"
loaded_pmc_dict = load_pmc_csv_to_dict(input_csv)

# Extract XML files for the SQs (the XML input dir can be nested).
# os.path.join keeps the path valid on non-Windows systems as well; a
# hard-coded backslash separator only resolves on Windows.
xml_input_dir = os.path.join("All_Articles", "PMC000xxxxxx")
output_file = "SQ_Sabrina.xml"
extract_xml_files(loaded_pubmed_dict, loaded_pmc_dict, xml_input_dir, output_file)