In [217]:
import pandas as pd
import numpy as np
from pathlib import Path
from io import StringIO
from lxml import etree
import xml.etree.ElementTree as ET
import os
import re

In [218]:
def format_xml(path):
    txt = Path(path).read_text()
    xml_data = StringIO(txt)

    df = pd.read_xml(xml_data, 
                    xpath="//dc:title", 
                    namespaces={"dc": "http://purl.org/dc/elements/1.1/", "dcterms":"http://purl.org/dc/terms/", 
                                                            "xsi": "http://www.w3.org/2001/XMLSchema-instance"},
                    parser="lxml")
    df = df.convert_dtypes(convert_string=True)

    txt = Path(path).read_text()
    xml_data = StringIO(txt)

    df2 = pd.read_xml(xml_data, 
                    xpath="//dc:description", 
                    namespaces={"dc": "http://purl.org/dc/elements/1.1/", "dcterms":"http://purl.org/dc/terms/", 
                                                            "xsi": "http://www.w3.org/2001/XMLSchema-instance"},
                    parser="lxml")
    df2 = df2.convert_dtypes(convert_string=True)
    df = df.merge(df2, on='lang', how='left')

    df.set_index('lang', inplace=True)
    data = {'title_en': [df['title']['en']], 'title_es': [df['title']['es']], 'description_en': [df['description']['en']], 'description_es': [df['description']['es']]}
    df = pd.DataFrame.from_dict(data)

    df['Source'] = 'MedlinePlus'
    df['File'] = f'{path}'

    return df

In [219]:
directory = 'MedlinePlus\dublin_core'

dataframes = []
counter = 0

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        try:
            df = format_xml(f)
            dataframes.append(df)
        except (UnicodeDecodeError, ValueError):
            print(f)
            counter += 1

print(counter)

final = pd.concat(dataframes, ignore_index=True)

MedlinePlus\dublin_core\ibc-036170.xml
MedlinePlus\dublin_core\ibc-036336.xml
MedlinePlus\dublin_core\ibc-036431.xml
MedlinePlus\dublin_core\ibc-036543.xml
MedlinePlus\dublin_core\ibc-036560.xml
MedlinePlus\dublin_core\ibc-036579.xml
MedlinePlus\dublin_core\ibc-036684.xml
MedlinePlus\dublin_core\ibc-036711.xml
MedlinePlus\dublin_core\ibc-036728.xml
MedlinePlus\dublin_core\ibc-036804.xml
MedlinePlus\dublin_core\ibc-036840.xml
MedlinePlus\dublin_core\ibc-036933.xml
MedlinePlus\dublin_core\ibc-037078.xml
MedlinePlus\dublin_core\ibc-037095.xml
MedlinePlus\dublin_core\ibc-037173.xml
MedlinePlus\dublin_core\ibc-037217.xml
MedlinePlus\dublin_core\ibc-037232.xml
MedlinePlus\dublin_core\ibc-037305.xml
MedlinePlus\dublin_core\ibc-037527.xml
MedlinePlus\dublin_core\ibc-037561.xml
MedlinePlus\dublin_core\ibc-037582.xml
MedlinePlus\dublin_core\ibc-037609.xml
MedlinePlus\dublin_core\ibc-037622.xml
MedlinePlus\dublin_core\ibc-037626.xml
MedlinePlus\dublin_core\ibc-037643.xml
MedlinePlus\dublin_core\i

In [None]:
final.to_csv('Pubmed.csv', index=False)