In [1]:
import os
import csv
from lxml import etree as ET

### This function takes the path to an XML file, extracts the title, abstract, PMID, and publication year, and returns them in a dictionary with the keys "Title", "Abstract", "PMID", and "PublicationDate".

In [3]:
def extract_metadata_from_xml(xml_path):
    """Extract title, abstract, PMID and publication year from one article XML file.

    The file is parsed with lxml in recovery mode, so malformed markup is
    tolerated where possible instead of aborting the parse.

    Args:
        xml_path: Path to the XML file to parse.

    Returns:
        A dict with the keys 'Title', 'Abstract', 'PMID' and 'PublicationDate'.
        Every value is a string; a field whose element is missing (or has no
        text) is returned as "".
    """
    # recover=True asks lxml to keep going past broken markup
    parser = ET.XMLParser(recover=True)
    root = ET.parse(xml_path, parser).getroot()

    metadata = {
        'Title': "",
        'Abstract': "",
        'PMID': "",
        'PublicationDate': ""
    }

    # In recovery mode an unsalvageable document can leave the tree without a
    # root element; return the empty record instead of failing on root.find().
    if root is None:
        return metadata

    # Title: first <article-title> inside a <title-group>, including the text
    # of nested inline elements (itertext walks all descendants).
    title_element = root.find(".//title-group/article-title")
    if title_element is not None:
        metadata['Title'] = ''.join(title_element.itertext()).strip()

    # Abstract: every <p> anywhere below an <abstract>, joined with spaces.
    # findall() never yields None, so no per-element None check is needed.
    paragraphs = root.findall(".//abstract//p")
    metadata['Abstract'] = " ".join(
        ''.join(paragraph.itertext()).strip() for paragraph in paragraphs
    )

    # PMID: first <article-id pub-id-type="pmid">
    pmid_element = root.find(".//article-id[@pub-id-type='pmid']")
    if pmid_element is not None and pmid_element.text:
        metadata['PMID'] = pmid_element.text.strip()

    # Publication date: only the <year> of the first matching <pub-date>
    pub_date_element = root.find(".//pub-date/year")
    if pub_date_element is not None and pub_date_element.text:
        metadata['PublicationDate'] = pub_date_element.text.strip()

    return metadata


### This goes through a directory of XML files, runs the previous function for each XML and saves the results in a CSV file.

In [9]:
def extract_metadata_from_directory(
    xml_directory,
    csv_file_path="/Users/zeynepkorkmaz/Downloads/metadata.csv",
):
    """Extract metadata from every .xml file in a directory and write it to a CSV.

    Args:
        xml_directory: Directory whose entries ending in '.xml' are processed
            (sub-directories are not searched).
        csv_file_path: Where the CSV is written. Defaults to the path this
            notebook originally hard-coded; pass your own path to override it.
            An existing file at this path is overwritten.

    Raises:
        OSError: If xml_directory cannot be listed (e.g. it does not exist) or
            the CSV file cannot be opened for writing.
    """
    # List the input first: if the directory is missing we fail here, before
    # the output file has been opened in 'w' mode and truncated.
    # os.listdir returns entries in arbitrary order, so sort them to make the
    # row order of the CSV reproducible between runs.
    xml_filenames = sorted(
        filename for filename in os.listdir(xml_directory) if filename.endswith('.xml')
    )

    # newline='' is what the csv module expects for files it writes itself
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["Title", "Abstract", "PMID", "PublicationDate"])
        writer.writeheader()

        # One CSV row per XML file
        for filename in xml_filenames:
            xml_path = os.path.join(xml_directory, filename)
            metadata = extract_metadata_from_xml(xml_path)
            writer.writerow(metadata)

### Specify the directory containing the XML files

In [12]:
# Directory of article XML files to process (machine-specific absolute path; adjust for your setup)
xml_path_all = "/Users/zeynepkorkmaz/Downloads/All_Articles/PMC000xxxxxx"

In [13]:
# Parse every .xml file in xml_path_all and write one CSV row per file
extract_metadata_from_directory(xml_path_all)

### This part is to check the output of one XML

In [5]:
# Single article used to spot-check the extraction (machine-specific absolute path)
xml_path_single = "/Users/zeynepkorkmaz/Downloads/separate_XML/PMC176545.xml"

In [6]:
# Bare expression so the notebook displays the returned metadata dict
extract_metadata_from_xml(xml_path_single)

{'Title': 'The Transcriptome of the Intraerythrocytic Developmental Cycle of Plasmodium falciparum',
 'Abstract': 'Plasmodium falciparum is the causative agent of the most burdensome form of human malaria, affecting 200–300 million individuals per year worldwide. The recently sequenced genome of P. falciparum revealed over 5,400 genes, of which 60% encode proteins of unknown function. Insights into the biochemical function and regulation of these genes will provide the foundation for future drug and vaccine development efforts toward eradication of this disease. By analyzing the complete asexual intraerythrocytic developmental cycle (IDC) transcriptome of the HB3 strain of P. falciparum, we demonstrate that at least 60% of the genome is transcriptionally active during this stage. Our data demonstrate that this parasite has evolved an extremely specialized mode of transcriptional regulation that produces a continuous cascade of gene expression, beginning with genes corresponding to gene

### Checking length of CSV

In [26]:
import csv

def count_csv_rows(filename, encoding='utf-8'):
    """Count the records in a CSV file, header row included.

    Records are counted with csv.reader, so a quoted field that contains line
    breaks still counts as a single row.

    Args:
        filename: Path to the CSV file.
        encoding: Text encoding used to read the file. Defaults to 'utf-8',
            matching the encoding the metadata CSV is written with, rather
            than relying on the platform's default encoding.

    Returns:
        The number of rows as an int (0 for an empty file).
    """
    # newline='' lets the csv module handle line endings inside quoted fields
    with open(filename, 'r', newline='', encoding=encoding) as csvfile:
        return sum(1 for _ in csv.reader(csvfile))

# Path of the CSV to count (machine-specific absolute path; change it to point at your own file)
csv_filename = '/Users/zeynepkorkmaz/Downloads/metadata.csv'
# The count comes from count_csv_rows, which counts every row the csv reader yields
csv_length = count_csv_rows(csv_filename)
print("Length of CSV file:", csv_length)


Length of CSV file: 2993


##### 2993 rows, including the header row with the column names

### Checking length of CSV again but using the pandas dataframe

In [14]:
import pandas as pd

In [15]:
# Load the metadata CSV into a DataFrame (machine-specific absolute path)
metadata = pd.read_csv("/Users/zeynepkorkmaz/Downloads/metadata.csv")

In [25]:
# Number of rows in the DataFrame
len(metadata)

2992

In [28]:
# Look for PMIDs that occur more than once. Rows without a PMID are dropped
# first so that missing values are not reported as duplicates of each other.
metadata_without_nan = metadata.dropna(subset=['PMID'])
# keep=False marks every member of a duplicate group, not just the repeats
duplicates = metadata_without_nan[metadata_without_nan.duplicated(subset=['PMID'], keep=False)]

if duplicates.empty:
    print("No duplicates found under PMID column.")
else:
    print("Duplicates found under PMID column (ignoring NaN values):")
    # Actually show the offending rows, grouped by PMID, so they can be inspected
    print(duplicates.sort_values(by='PMID'))

No duplicates found under PMID column.
