In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from io import StringIO
from lxml import etree
import xml.etree.ElementTree as ET
import os
import re
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize

In [None]:
def format_xml(path):
    """Flatten one bilingual XML record into a single-row DataFrame.

    Every ``dc:title`` and ``dc:description`` element in the file is read,
    the two sets are joined on their ``lang`` column, and the English
    (``en``) and Spanish (``es``) values are laid out side by side.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the XML file to parse.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``title_en``, ``title_es``,
        ``description_en``, ``description_es``, ``Source`` (always
        ``'MedlinePlus'``) and ``File`` (``path`` rendered as a string).

    Raises
    ------
    UnicodeDecodeError
        If the file is not valid UTF-8.
    ValueError
        Propagated from ``pandas.read_xml``, e.g. when an XPath matches
        no element.
    KeyError
        If the ``lang``, ``title`` or ``description`` column is missing,
        or there is no ``en`` / ``es`` entry.
    """
    namespaces = {
        "dc": "http://purl.org/dc/elements/1.1/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsi": "http://www.w3.org/2001/XMLSchema-instance",
    }

    # Read the file once. The encoding is explicit because read_text()
    # otherwise uses the platform default, which corrupts or rejects accented
    # Spanish text on non-UTF-8 locales; read_xml re-encodes the text as UTF-8.
    xml_text = Path(path).read_text(encoding="utf-8")

    def _read_elements(tag):
        """Return one row per ``dc:<tag>`` element with inferred dtypes."""
        frame = pd.read_xml(
            StringIO(xml_text),
            xpath=f"//dc:{tag}",
            namespaces=namespaces,
            parser="lxml",
        )
        return frame.convert_dtypes(convert_string=True)

    # Left join: a title whose language has no description keeps a missing
    # description instead of being dropped.
    by_lang = (
        _read_elements("title")
        .merge(_read_elements("description"), on="lang", how="left")
        .set_index("lang")
    )

    record = pd.DataFrame(
        {
            "title_en": [by_lang.loc["en", "title"]],
            "title_es": [by_lang.loc["es", "title"]],
            "description_en": [by_lang.loc["en", "description"]],
            "description_es": [by_lang.loc["es", "description"]],
        }
    )

    record["Source"] = "MedlinePlus"
    record["File"] = f"{path}"

    return record

In [None]:
def create_dataframes(dir):
    """Parse every regular file in a directory and stack the results.

    Each file is passed to ``format_xml``. A file that raises
    ``UnicodeDecodeError``, ``ValueError`` or ``KeyError`` is skipped and
    its path is printed; the number of skipped files is printed at the end.

    Parameters
    ----------
    dir : str or os.PathLike
        Directory to scan (non-recursive). The name shadows the builtin
        ``dir`` but is kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated with a fresh 0..n-1 index, in
        sorted file-name order.

    Raises
    ------
    ValueError
        If no file in the directory could be parsed.
    """
    directory = dir

    dataframes = []
    counter = 0

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the combined frame reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        f = os.path.join(directory, filename)
        # Sub-directories and other non-file entries are ignored.
        if not os.path.isfile(f):
            continue
        try:
            dataframes.append(format_xml(f))
        except (UnicodeDecodeError, ValueError, KeyError):
            # Best effort: report the offending file and keep going.
            print(f)
            counter += 1

    print(counter)

    # pd.concat([]) would raise an opaque "No objects to concatenate";
    # raise the same exception type with a message that names the directory.
    if not dataframes:
        raise ValueError(f"No files in {directory} could be parsed into a DataFrame")

    return pd.concat(dataframes, ignore_index=True)

In [None]:
# Parse every file in the 'Pubmed' directory into one combined DataFrame.
# Optional export, left disabled: final.to_csv('Pubmed.csv', index=False)
final = create_dataframes('Pubmed')

In [None]:
def _count_words(text):
    """Count whitespace-separated tokens in ``str(text)``."""
    # str() first so a missing description is counted via its string form
    # instead of raising on .split().
    return len(str(text).split())


# Computed column-wise: only the description column is needed, so there is no
# reason to materialise every row as a Series with apply(axis=1).
final['NumWordsEn'] = final['description_en'].map(_count_words)
final['NumWordsEs'] = final['description_es'].map(_count_words)

In [None]:
# Print the combined DataFrame as plain text to inspect the parsed records.
print(final)

In [None]:
# Histogram of English description lengths (in words).
ax_en = final['NumWordsEn'].hist()

# Label the axes the histogram was drawn on.
ax_en.set_xlabel('Num')
ax_en.set_ylabel('Frequency')
ax_en.set_title('Distribution of Words En Pubmed')

In [None]:
# Histogram of Spanish description lengths (in words).
ax_es = final['NumWordsEs'].hist()

# Label the axes the histogram was drawn on.
ax_es.set_xlabel('Num')
ax_es.set_ylabel('Frequency')
ax_es.set_title('Distribution of Words Es Pubmed')