In [172]:
import pandas as pd
import xml.etree.ElementTree as ET
import re
import os
import time
pd.set_option('display.max_rows', 500)

In [1]:
# Ask for the path of the single PubMed XML file to parse. Surrounding
# whitespace and double quotes (common when a path is pasted) are removed so
# the value can be handed to ET.parse unchanged.
XML_input = input("Path to the PubMed XML file: ").strip().strip('"')

 E:\PMIDsMinimalExampleParsingXMLs\200.txt


In [3]:
# Ask for the path of the CSV file that the parsed table is written to
# (consumed by df.to_csv further down). Surrounding whitespace and double
# quotes from a pasted path are removed.
DF_output = input("Path of the CSV file to write: ").strip().strip('"')

  E:\PMIDsMinimalExampleParsingXMLs.csv


In [73]:
# Parse the XML file whose path is stored in XML_input and keep a handle on
# its root element; the parsing loop iterates over the root's direct children.
tree = ET.parse(XML_input)
root = tree.getroot()

In [157]:
def element_text(element):
    """Return all text inside ``element`` (inline child tags included), stripped.

    ``Element.text`` only holds the text that precedes the first child tag, so
    a title, keyword or affiliation containing markup such as <i>, <sub> or
    <sup> would be truncated, or be ``None`` when it starts with a tag.
    ``itertext()`` yields every text and tail fragment in document order.

    Returns ``None`` when ``element`` is ``None`` and ``''`` for an empty element.
    """
    if element is None:
        return None
    return ''.join(element.itertext()).strip()


def joined_texts(elements):
    """Join the text of each element with ', '; return ``None`` if there are none."""
    values = [element_text(element) for element in elements]
    return ', '.join(values) if values else None


def abstract_text(abstract_node, skip_tags=()):
    """Flatten an <Abstract> element into one string, or ``None`` if it is absent.

    Every direct child (section) is flattened with ``itertext()`` so text that
    follows inline markup is kept and stays in document order; sections are
    separated by a single space. Children whose tag is in ``skip_tags`` are
    left out.
    """
    if abstract_node is None:
        return None
    fragments = [abstract_node.text or '']
    for section in abstract_node:
        if section.tag in skip_tags:
            continue
        fragments.append(''.join(section.itertext()))
    stripped = (fragment.strip() for fragment in fragments)
    return ' '.join(fragment for fragment in stripped if fragment)


def parse_authors(author_nodes):
    """Return a list of ``{'Name', 'Affiliation'}`` dicts for the given <Author> nodes.

    Authors lacking either <ForeName> or <LastName> are skipped, which keeps
    the selection rule of the original loop.
    """
    authors = []
    for author in author_nodes:
        forename_node = author.find('./ForeName')
        lastname_node = author.find('./LastName')
        if forename_node is None or lastname_node is None:
            continue
        name_parts = (element_text(forename_node), element_text(lastname_node))
        authors.append({
            # Empty parts are dropped so an empty element cannot leave a stray space.
            'Name': ' '.join(part for part in name_parts if part),
            'Affiliation': [
                element_text(affiliation)
                for affiliation in author.findall('./AffiliationInfo/Affiliation')
            ],
        })
    return authors


def parse_article(node):
    """Build the record dict for one <PubmedArticle> element."""
    descriptor_nodes = (
        heading.find('./DescriptorName')
        for heading in node.findall('./MedlineCitation/MeshHeadingList/MeshHeading')
    )
    substance_nodes = (
        chemical.find('./NameOfSubstance')
        for chemical in node.findall('./MedlineCitation/ChemicalList/Chemical')
    )
    # Key order is kept from the original loop because it fixes the column order.
    return {
        'Type': 'Article',
        'PMID': element_text(node.find('./MedlineCitation/PMID')),
        'Journal': element_text(node.find('./MedlineCitation/Article/Journal/Title')),
        'Title': element_text(node.find('./MedlineCitation/Article/ArticleTitle')),
        'Abstract': abstract_text(node.find('./MedlineCitation/Article/Abstract')),
        'Authors': parse_authors(node.findall('./MedlineCitation/Article/AuthorList/Author')),
        # Headings / chemicals without the expected child are skipped instead of crashing.
        'MeshHeadings': joined_texts(d for d in descriptor_nodes if d is not None),
        'Chemicals': joined_texts(s for s in substance_nodes if s is not None),
        'PublicationTypes': joined_texts(
            node.findall('./MedlineCitation/Article/PublicationTypeList/PublicationType')
        ),
        # Only a <Year> child is read; records without one get None.
        'PublicationDate': element_text(
            node.find('./MedlineCitation/Article/Journal/JournalIssue/PubDate/Year')
        ),
        'Language': element_text(node.find('./MedlineCitation/Article/Language')),
        'Keywords': joined_texts(node.findall('./MedlineCitation/KeywordList/Keyword')),
    }


def parse_book_article(node):
    """Build the record dict for one <PubmedBookArticle> element."""
    return {
        'Type': 'Book Article',
        'PMID': element_text(node.find('./BookDocument/PMID')),
        'Publisher': element_text(node.find('./BookDocument/Book/Publisher/PublisherName')),
        'BookTitle': element_text(node.find('./BookDocument/Book/BookTitle')),
        'Title': element_text(node.find('./BookDocument/ArticleTitle')),
        # The copyright notice is not part of the abstract text.
        'Abstract': abstract_text(
            node.find('./BookDocument/Abstract'), skip_tags=('CopyrightInformation',)
        ),
        'Authors': parse_authors(
            node.findall('./BookDocument/AuthorList[@Type="authors"]/Author')
        ),
        'PublicationDate': element_text(node.find('./BookDocument/Book/PubDate/Year')),
        'Language': element_text(node.find('./BookDocument/Language')),
        'Keywords': joined_texts(node.findall('./BookDocument/KeywordList/Keyword')),
    }


# Maps the tag of a top-level record to the function that parses it.
RECORD_PARSERS = {
    'PubmedArticle': parse_article,
    'PubmedBookArticle': parse_book_article,
}

# One dict per supported record found directly under the root element.
data = []
for node in root:
    parser = RECORD_PARSERS.get(node.tag)
    if parser is None:
        # Children with any other tag are ignored.
        continue
    data.append(parser(node))

In [158]:
# Build one row per parsed record. 'Article' and 'Book Article' records set
# different keys, so columns a record type does not set are left empty.
df = pd.DataFrame(data)
# Preview the first five rows.
df.head(5)

Unnamed: 0,Type,PMID,Publisher,BookTitle,Title,Abstract,Authors,PublicationDate,Language,Keywords,Journal,MeshHeadings,Chemicals,PublicationTypes
0,Book Article,20301606,"University of Washington, Seattle",GeneReviews,Laing Distal Myopathy,Laing distal myopathy is characterized by earl...,"[{'Name': 'Phillipa Lamont', 'Affiliation': ['...",1993,eng,"Laing Early-Onset Distal Myopathy, Laing Early...",,,,
1,Book Article,20301607,"University of Washington, Seattle",GeneReviews,Genetic Hearing Loss Overview,The purpose of this GeneReview is to: 1.. Des...,"[{'Name': 'A Eliot Shearer', 'Affiliation': ['...",1993,eng,"Nonsyndromic Hearing Loss and Deafness, Actin,...",,,,
2,Book Article,20301608,"University of Washington, Seattle",GeneReviews,Alpha-Thalassemia,Alpha-thalassemia (α-thalassemia) has two clin...,"[{'Name': 'Hannah Tamary', 'Affiliation': ['He...",1993,eng,Hemoglobin Bart Hydrops Fetalis (Hb Bart) Synd...,,,,
3,Book Article,20301611,"University of Washington, Seattle",GeneReviews,Spinocerebellar Ataxia Type 17,Spinocerebellar ataxia type 17 (SCA17) is char...,"[{'Name': 'Yasuko Toyoshima', 'Affiliation': [...",1993,eng,"Huntington Disease-Like 4, SCA17, SCA17, Hunti...",,,,
4,Book Article,20301613,"University of Washington, Seattle",GeneReviews,HFE-Related Hemochromatosis,HFE- related hemochromatosis ( HFE HC) is ...,"[{'Name': 'James C Barton', 'Affiliation': ['U...",1993,eng,"HFE-Associated Hemochromatosis, HFE-Associated...",,,,


In [None]:
# Write the parsed table to the CSV path stored in DF_output.
# NOTE(review): the 'Authors' column holds lists of dicts, so it is presumably
# stored as its string representation — confirm downstream readers expect that.
df.to_csv(DF_output)

In [160]:
# Ask for the directory holding the PubMed XML files. Surrounding whitespace
# and double quotes are removed, and os.path.join(..., "") appends a trailing
# path separator when it is missing, so building a file path by appending a
# file name to XML_input works whether or not the separator was typed.
XML_input = os.path.join(
    input("Directory containing the PubMed XML files: ").strip().strip('"'), ""
)

 E:\PMIDsMinimalExampleParsingXMLs\


In [161]:
# Names of the files to parse. os.listdir returns entries in arbitrary order
# and includes sub-directories, so the names are sorted for a reproducible
# run and restricted to regular files.
XML_files = [
    name
    for name in sorted(os.listdir(XML_input))
    if os.path.isfile(os.path.join(XML_input, name))
]

In [173]:
# The helper definitions live in this cell so that it runs on its own once the
# import, directory-prompt and file-listing cells have been executed.

def element_text(element):
    """Return all text inside ``element`` (inline child tags included), stripped.

    ``Element.text`` only holds the text that precedes the first child tag, so
    a title, keyword or affiliation containing markup such as <i>, <sub> or
    <sup> would be truncated, or be ``None`` when it starts with a tag.
    ``itertext()`` yields every text and tail fragment in document order.

    Returns ``None`` when ``element`` is ``None`` and ``''`` for an empty element.
    """
    if element is None:
        return None
    return ''.join(element.itertext()).strip()


def joined_texts(elements):
    """Join the text of each element with ', '; return ``None`` if there are none."""
    values = [element_text(element) for element in elements]
    return ', '.join(values) if values else None


def abstract_text(abstract_node, skip_tags=()):
    """Flatten an <Abstract> element into one string, or ``None`` if it is absent.

    Every direct child (section) is flattened with ``itertext()`` so text that
    follows inline markup is kept and stays in document order; sections are
    separated by a single space. Children whose tag is in ``skip_tags`` are
    left out.
    """
    if abstract_node is None:
        return None
    fragments = [abstract_node.text or '']
    for section in abstract_node:
        if section.tag in skip_tags:
            continue
        fragments.append(''.join(section.itertext()))
    stripped = (fragment.strip() for fragment in fragments)
    return ' '.join(fragment for fragment in stripped if fragment)


def parse_authors(author_nodes):
    """Return a list of ``{'Name', 'Affiliation'}`` dicts for the given <Author> nodes.

    Authors lacking either <ForeName> or <LastName> are skipped, which keeps
    the selection rule of the original loop.
    """
    authors = []
    for author in author_nodes:
        forename_node = author.find('./ForeName')
        lastname_node = author.find('./LastName')
        if forename_node is None or lastname_node is None:
            continue
        name_parts = (element_text(forename_node), element_text(lastname_node))
        authors.append({
            # Empty parts are dropped so an empty element cannot leave a stray space.
            'Name': ' '.join(part for part in name_parts if part),
            'Affiliation': [
                element_text(affiliation)
                for affiliation in author.findall('./AffiliationInfo/Affiliation')
            ],
        })
    return authors


def parse_article(node):
    """Build the record dict for one <PubmedArticle> element."""
    descriptor_nodes = (
        heading.find('./DescriptorName')
        for heading in node.findall('./MedlineCitation/MeshHeadingList/MeshHeading')
    )
    substance_nodes = (
        chemical.find('./NameOfSubstance')
        for chemical in node.findall('./MedlineCitation/ChemicalList/Chemical')
    )
    # Key order is kept from the original loop because it fixes the column order.
    return {
        'Type': 'Article',
        'PMID': element_text(node.find('./MedlineCitation/PMID')),
        'Journal': element_text(node.find('./MedlineCitation/Article/Journal/Title')),
        'Title': element_text(node.find('./MedlineCitation/Article/ArticleTitle')),
        'Abstract': abstract_text(node.find('./MedlineCitation/Article/Abstract')),
        'Authors': parse_authors(node.findall('./MedlineCitation/Article/AuthorList/Author')),
        # Headings / chemicals without the expected child are skipped instead of crashing.
        'MeshHeadings': joined_texts(d for d in descriptor_nodes if d is not None),
        'Chemicals': joined_texts(s for s in substance_nodes if s is not None),
        'PublicationTypes': joined_texts(
            node.findall('./MedlineCitation/Article/PublicationTypeList/PublicationType')
        ),
        # Only a <Year> child is read; records without one get None.
        'PublicationDate': element_text(
            node.find('./MedlineCitation/Article/Journal/JournalIssue/PubDate/Year')
        ),
        'Language': element_text(node.find('./MedlineCitation/Article/Language')),
        'Keywords': joined_texts(node.findall('./MedlineCitation/KeywordList/Keyword')),
    }


def parse_book_article(node):
    """Build the record dict for one <PubmedBookArticle> element."""
    return {
        'Type': 'Book Article',
        'PMID': element_text(node.find('./BookDocument/PMID')),
        'Publisher': element_text(node.find('./BookDocument/Book/Publisher/PublisherName')),
        'BookTitle': element_text(node.find('./BookDocument/Book/BookTitle')),
        'Title': element_text(node.find('./BookDocument/ArticleTitle')),
        # The copyright notice is not part of the abstract text.
        'Abstract': abstract_text(
            node.find('./BookDocument/Abstract'), skip_tags=('CopyrightInformation',)
        ),
        'Authors': parse_authors(
            node.findall('./BookDocument/AuthorList[@Type="authors"]/Author')
        ),
        'PublicationDate': element_text(node.find('./BookDocument/Book/PubDate/Year')),
        'Language': element_text(node.find('./BookDocument/Language')),
        'Keywords': joined_texts(node.findall('./BookDocument/KeywordList/Keyword')),
    }


# Maps the tag of a top-level record to the function that parses it.
RECORD_PARSERS = {
    'PubmedArticle': parse_article,
    'PubmedBookArticle': parse_book_article,
}

# One dict per supported record, accumulated over every file in XML_files.
data_scale = []

start = time.time()

for el in XML_files:
    print(el)
    # os.path.join gives the right path whether or not XML_input ends with a
    # path separator; plain string concatenation silently required one.
    tree = ET.parse(os.path.join(XML_input, el))
    root = tree.getroot()

    for node in root:
        parser = RECORD_PARSERS.get(node.tag)
        if parser is None:
            # Children with any other tag are ignored.
            continue
        data_scale.append(parser(node))

print("Parsing time: "+str(time.time()-start)+" seconds")

0.txt
1000.txt
10000.txt
10200.txt
10400.txt
10600.txt
10800.txt
11000.txt
11200.txt
11400.txt
11600.txt
11800.txt
1200.txt
12000.txt
12200.txt
12400.txt
12600.txt
12800.txt
13000.txt
13200.txt
13400.txt
13600.txt
13800.txt
1400.txt
14000.txt
14200.txt
14400.txt
14600.txt
14800.txt
15000.txt
15200.txt
15400.txt
15600.txt
15800.txt
1600.txt
16000.txt
16200.txt
16400.txt
16600.txt
16800.txt
17000.txt
17200.txt
17400.txt
17600.txt
17800.txt
1800.txt
18000.txt
18200.txt
18400.txt
18600.txt
18800.txt
19000.txt
19200.txt
19400.txt
19600.txt
19800.txt
200.txt
2000.txt
2200.txt
2400.txt
2600.txt
2800.txt
3000.txt
3200.txt
3400.txt
3600.txt
3800.txt
400.txt
4000.txt
4200.txt
4400.txt
4600.txt
4800.txt
5000.txt
5200.txt
5400.txt
5600.txt
5800.txt
600.txt
6000.txt
6200.txt
6400.txt
6600.txt
6800.txt
7000.txt
7200.txt
7400.txt
7600.txt
7800.txt
800.txt
8000.txt
8200.txt
8400.txt
8600.txt
8800.txt
9000.txt
9200.txt
9400.txt
9600.txt
9800.txt
Parsing time: 13.33194613456726 seconds
