In [7]:
import os
import csv
from lxml import etree as ET

In [8]:
def extract_metadata_from_xml(xml_path):
    # Initialize the XMLParser with recover option
    parser = ET.XMLParser(recover=True)
    tree = ET.parse(xml_path, parser)
    root = tree.getroot()

    # Safely extracting title
    title_element = root.find(".//title-group/article-title")
    title = title_element.text.strip() if title_element is not None and title_element.text else ""

    # Safely extracting abstract, including all sections
    abstract_elements = root.findall(".//abstract//p")
    abstract = " ".join([elem.text.strip() for elem in abstract_elements if elem is not None and elem.text])

    # Safely extracting PMID
    pmid_element = root.find(".//article-id[@pub-id-type='pmid']")
    pmid = pmid_element.text.strip() if pmid_element is not None and pmid_element.text else ""

    # Safely extracting publication date
    pub_date_element = root.find(".//pub-date/year")
    pub_date = pub_date_element.text.strip() if pub_date_element is not None and pub_date_element.text else ""

    return {
        'Title': title,
        'Abstract': abstract,
        'PMID': pmid,
        'PublicationDate': pub_date
    }


In [13]:
def extract_metadata_from_directory(xml_directory):
    # Prepare the CSV file for writing
    csv_file_path = "/Users/zeynepkorkmaz/Downloads/metadata.csv"
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["Title", "Abstract", "PMID", "PublicationDate"])
        writer.writeheader()

        # Loop through each XML file in the directory
        for filename in os.listdir(xml_directory):
            if filename.endswith('.xml'):
                xml_path = os.path.join(xml_directory, filename)
                metadata = extract_metadata_from_xml(xml_path)
                writer.writerow(metadata)

In [10]:
xml_path_all = "/Users/zeynepkorkmaz/Downloads/separate_XML"

In [20]:
xml_path_single = "/Users/zeynepkorkmaz/Downloads/separate_XML/PMC176545.xml"

In [18]:
extract_metadata_from_directory(xml_path_all)

In [33]:
extract_metadata_from_xml_v2(xml_path_single)

{'Title': 'The Transcriptome of the Intraerythrocytic Developmental Cycle of',
 'Abstract': ' A tight cascade of gene regulation during the lifecycle of the malaria parasite in human blood cells suggests new functions for many',
 'PMID': '12929205',
 'PublicationDate': '2003'}