In [1]:
import os
import csv
from lxml import etree as ET
from bs4 import BeautifulSoup

### This function takes an XML file as input, parses it for "Title, abstract, PMID, PublicationDate" and returns these values in a dictionary.

In [2]:
def read_author_csv(csv_file):
    author_list = []

    with open(csv_file, 'r', encoding='utf-8') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)  # Skip the header row if it exists

        for row in csv_reader:
            author = row[0].strip()  # Assuming the author names are in the first column
            author_list.append(author)

    return author_list

In [3]:
author_csv_file = 'crc_authors.csv'
authors = read_author_csv(author_csv_file)

In [23]:
authors

['Antonio Enrico Zaurito',
 'Markus Tschurtschenthaler',
 'Verena Friedrich',
 'Ignasi Forné',
 'Dana Matzek',
 'Diana Ring',
 'Bastian Popper',
 'Lara Jochum',
 'Stefanie Spriewald',
 'Tobias Straub',
 'Axel Imhof',
 'Anne Krug',
 'Bärbel Stecher',
 'Thomas Brocker',
 'N. van Best',
 'U. Rolle-Kampczyk',
 'F. G. Schaap',
 'M. Basic',
 'S. W. M. Olde Damink',
 'A. Bleich',
 'P. H. M. Savelkoul',
 'M. von Bergen',
 'J. Penders',
 'M. W. Hornef',
 'Tanja Groll',
 'Franziska Schopf',
 'Daniela Denk',
 'Carolin Mogler',
 'Ulrike Schwittlick',
 'Heike Aupperle-Lellbach',
 'Sabrina Rim Jahan Sarker',
 'Nicole Pfarr',
 'Wilko Weichert',
 'Kaspar Matiasek',
 'Moritz Jesinghaus',
 'Katja Steiger',
 'Michela Carlet',
 'Kerstin Völse',
 'Jenny Vergalli',
 'Martin Becker',
 'Tobias Herold',
 'Anja Arner',
 'Daniela Senft',
 'Vindi Jurinovic',
 'Wen-Hsin Liu',
 'Yuqiao Gao',
 'Veronika Dill',
 'Boris Fehse',
 'Claudia D. Baldus',
 'Lorenz Bastian',
 'Lennart Lenk',
 'Denis M. Schewe',
 'Johannes W.

In [4]:
def extract_metadata_from_xml(xml_path, author_list):
    # Initialize the XMLParser with recover option
    parser = ET.XMLParser(recover=True)
    tree = ET.parse(xml_path, parser)
    root = tree.getroot()

    # Safely extracting authors
    authors = extract_authors_from_xml(root)

    # Safely extracting title with all child elements
    title_element = root.find(".//title-group/article-title")
    title = ''.join(title_element.itertext()).strip() if title_element is not None else ""

    # Safely extracting abstract, including all sections and child elements
    abstract_elements = root.findall(".//abstract//p")
    abstract = " ".join([''.join(elem.itertext()).strip() for elem in abstract_elements if elem is not None])

    # Safely extracting PMID
    pmid_element = root.find(".//article-id[@pub-id-type='pmid']")
    pmid = pmid_element.text.strip() if pmid_element is not None and pmid_element.text else ""

    # Safely extracting publication date
    pub_date_element = root.find(".//pub-date/year")
    pub_date = pub_date_element.text.strip() if pub_date_element is not None and pub_date_element.text else ""

    # Check if any author is in the author list
    crc_value = "Yes" if any(author in author_list for author in authors) else "No"

    return {
        'Title': title,
        'Abstract': abstract,
        'PMID': pmid,
        'PublicationDate': pub_date,
        'CRC': crc_value
    }

### This goes through a directory of XML files, runs the previous function for each XML and saves the results in a CSV file.

In [5]:
def extract_metadata_from_directory(xml_directory, author_list):
    # CHANGE THE OUTPUT FILE PATH HERE
    csv_file_path = "crc_training_extended.csv"
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["Title", "Abstract", "PMID", "PublicationDate", "CRC"])
        writer.writeheader()

        # Loop through each XML file in the directory
        for filename in os.listdir(xml_directory):
            if filename.endswith('.xml'):
                xml_path = os.path.join(xml_directory, filename)
                metadata = extract_metadata_from_xml(xml_path, author_list)
                writer.writerow(metadata)

In [6]:
def extract_authors_from_xml(root):
    authors = []
    for author_tag in root.findall('.//contrib[@contrib-type="author"]'):
        author_name = author_tag.find('name')
        if author_name is not None:
            given_names = author_name.find('given-names').text.strip()
            surname = author_name.find('surname').text.strip()
            authors.append(f"{given_names} {surname}")
    return authors

### Specify the directory to the XML files

In [18]:
xml_path_all = "C:/OneDrive/Dokumente/Master_1. Semester/Systems_Biomedicine/NLP/nlp_tool_training_papers_extended_xml"

In [19]:
extract_metadata_from_directory(xml_path_all, authors)

FileNotFoundError: [WinError 3] Das System kann den angegebenen Pfad nicht finden: 'C:/OneDrive/Dokumente/Master_1. Semester/Systems_Biomedicine/NLP/nlp_tool_training_papers_extended_xml'

In [9]:
os.getcwd()

'C:\\Users\\sabri\\OneDrive\\Dokumente\\Master_1. Semester\\Systems_Biomedicine\\NLP'

### Checking length of CSV

In [26]:
import csv

def count_csv_rows(filename):
    with open(filename, 'r', newline='') as csvfile:
        csv_reader = csv.reader(csvfile)
        row_count = sum(1 for row in csv_reader)
    return row_count

# Replace 'your_file.csv' with the path to your CSV file
csv_filename = '/Users/zeynepkorkmaz/Downloads/metadata.csv'
csv_length = count_csv_rows(csv_filename)
print("Length of CSV file:", csv_length)


Length of CSV file: 2993


##### 2993 including the row with the column names

### Checking length of CSV again but using the pandas dataframe

In [14]:
import pandas as pd

In [15]:
metadata = pd.read_csv("/Users/zeynepkorkmaz/Downloads/metadata.csv")

In [25]:
len(metadata)

2992

In [28]:
metadata_without_nan = metadata.dropna(subset=['PMID'])
duplicates = metadata_without_nan[metadata_without_nan.duplicated(subset=['PMID'], keep=False)]

if duplicates.empty:
    print("No duplicates found under PMID column.")
else:
    print("Duplicates found under PMID column (ignoring NaN values):")

No duplicates found under PMID column.
