# Author Manuscripts Dataset


In [None]:
import re
import pandas as pd
import tarfile
import xml.etree.ElementTree as ET
import os

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """HTML parser that drops all markup and accumulates only text content.

    Every text chunk is appended to ``_data`` preceded by a single space, so
    text coming from adjacent elements does not run together.
    """

    def __init__(self):
        super().__init__()
        self._data = ''

    def handle_data(self, data):
        # Invoked by HTMLParser for each run of text found between tags.
        self._data += ' ' + data


def parse_text(text):
    """Return the text content of a markup string, without the tags.

    Args:
        text: Markup (HTML/XML serialised as ``str``) to strip of tags.

    Returns:
        The concatenated text chunks (each preceded by a space) with leading
        and trailing spaces and newlines removed.
    """
    parser = MyHTMLParser()
    parser.feed(text)
    # feed() can hold back trailing text that might be an incomplete
    # character reference (e.g. "AT&T"); close() forces it to be processed.
    parser.close()
    # Strip spaces and newlines together: stripping them one after the other
    # leaves spaces behind once the surrounding newlines are gone.
    return parser._data.strip(' \n')
    
def process_one_tar(file_dir, file_name):
    """Extract the section text of every article in one ``.tar.gz`` to a CSV.

    Each regular-file member of the archive is parsed as an XML document. The
    text of all ``body/sec`` elements is joined with newlines and stored in
    one row together with the article's PMC id (prefixed with ``PMC``) and the
    constant source label ``author_manuscript``. The CSV is written next to
    the archive with ``.tar.gz`` replaced by ``.full_text_v2.csv``; if that
    file already exists the archive is skipped.

    Args:
        file_dir: Directory that contains the archive and receives the CSV.
        file_name: Name of the archive inside ``file_dir``.

    Raises:
        AssertionError: If a document does not have exactly one
            ``article-id`` element with ``pub-id-type="pmc"``.
    """
    infile = f'{file_dir}/{file_name}'
    outfile = '{}/{}'.format(file_dir, file_name.replace('.tar.gz', '.full_text_v2.csv'))
    print('Processing:', infile)

    # Finished archives are skipped so an interrupted run can be resumed.
    if os.path.exists(outfile):
        print('Existed:', outfile)
        return

    out_data = {'pmc_s': [], 'source_s': [], 'text_txt_en': []}
    # The context manager closes the archive even if a document fails to parse.
    with tarfile.open(infile, "r:gz") as tar:
        for member in tar.getmembers():
            f = tar.extractfile(member)
            if f is None:
                # Directories and other non-file members carry no XML.
                continue

            # Read the xml file
            root = ET.parse(f).getroot()

            # Get the PMC id
            pmc_li = root.findall('front/article-meta/article-id[@pub-id-type="pmc"]')
            assert len(pmc_li) == 1, f'pmc size: {len(pmc_li)}'
            pmc = pmc_li[0].text

            # Collect the text of every top-level section of the body
            sections = [
                parse_text(ET.tostring(section, encoding='unicode'))
                for section in root.findall('body/sec')
            ]
            text = '\n'.join(sections).strip()

            out_data['pmc_s'].append(f'PMC{pmc}')
            out_data['source_s'].append('author_manuscript')
            out_data['text_txt_en'].append(text)

    out_df = pd.DataFrame(out_data)
    out_df.to_csv(outfile, index=False)
    print('Output:', outfile)
    print('Output size:', len(out_df))

import glob
# Directory holding the author-manuscript archives; CSVs are written here too.
file_dir = '/labs/sarkerlab/yguo262/biomedical_data/pma/manuscript_2024/'
files = glob.glob(file_dir + '/*.tar.gz')
for file_path in files:
    # Keep only the part after the last '/', i.e. the archive's file name.
    file_name = file_path.rpartition('/')[2]
    process_one_tar(file_dir, file_name)

Processing: /labs/sarkerlab/yguo262/biomedical_data/pma/manuscript_2024//author_manuscript_xml.PMC001xxxxxx.baseline.2023-12-16.tar.gz
Output: /labs/sarkerlab/yguo262/biomedical_data/pma/manuscript_2024//author_manuscript_xml.PMC001xxxxxx.baseline.2023-12-16.full_text_v2.csv
Output size: 500
Processing: /labs/sarkerlab/yguo262/biomedical_data/pma/manuscript_2024//author_manuscript_xml.PMC002xxxxxx.baseline.2023-12-16.tar.gz
Output: /labs/sarkerlab/yguo262/biomedical_data/pma/manuscript_2024//author_manuscript_xml.PMC002xxxxxx.baseline.2023-12-16.full_text_v2.csv
Output size: 92207
Processing: /labs/sarkerlab/yguo262/biomedical_data/pma/manuscript_2024//author_manuscript_xml.PMC003xxxxxx.baseline.2023-12-16.tar.gz


# PMC_OA_subset data

In [None]:
import re
import pandas as pd
import tarfile
import xml.etree.ElementTree as ET
import os

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """HTML parser that drops all markup and accumulates only text content.

    Text chunks are concatenated into ``_data`` exactly as they appear, with
    no separator inserted between them.
    """

    def __init__(self):
        super().__init__()
        self._data = ''

    def handle_data(self, data):
        # Invoked by HTMLParser for each run of text found between tags.
        self._data += data


def parse_text(text):
    """Return the text content of a markup string, without the tags.

    Args:
        text: Markup (HTML/XML serialised as ``str``) to strip of tags.

    Returns:
        The concatenated text chunks with leading and trailing spaces and
        newlines removed.
    """
    parser = MyHTMLParser()
    parser.feed(text)
    # feed() can hold back trailing text that might be an incomplete
    # character reference (e.g. "AT&T"); close() forces it to be processed.
    parser.close()
    # Strip spaces and newlines together: stripping them one after the other
    # leaves spaces behind once the surrounding newlines are gone.
    return parser._data.strip(' \n')
    
def _join_element_texts(elements):
    """Return the tag-free text of *elements*, one element per line, stripped."""
    texts = [parse_text(ET.tostring(el, encoding='unicode')) for el in elements]
    return '\n'.join(texts).strip()


def process_one_tar(file_dir, file_name):
    """Extract article text from every XML document in one ``.tar.gz`` to a CSV.

    Each regular-file member of the archive is parsed as an XML document.
    Documents without exactly one ``article-id`` of ``pub-id-type="pmc"`` are
    skipped. For the others:

    * If the document has ``sub-article`` elements, each sub-article becomes
      its own row, identified as ``<pmc>_<sub-article id>`` with source
      ``oa_sub_article`` and the text of its ``body/p`` elements.
    * Otherwise one row is produced from the first of these that is present:
      ``body/sec`` or ``body/p`` (source ``oa_full_text``), then
      ``front/article-meta/abstract/p`` (source ``oa_abstract``).

    Rows whose text is empty are dropped. The CSV is written next to the
    archive with ``.tar.gz`` replaced by ``.sec.csv``; if that file already
    exists the archive is skipped.

    Args:
        file_dir: Directory that contains the archive and receives the CSV.
        file_name: Name of the archive inside ``file_dir``.
    """
    infile = f'{file_dir}/{file_name}'
    outfile = '{}/{}'.format(file_dir, file_name.replace('.tar.gz', '.sec.csv'))
    print('Processing:', infile)

    # Finished archives are skipped so an interrupted run can be resumed.
    if os.path.exists(outfile):
        print('Existed:', outfile)
        return

    # Element paths tried in order for a document without sub-articles, paired
    # with the source label recorded for the row.
    text_sources = (
        ('body/sec', 'full_text'),
        ('body/p', 'full_text'),
        ('front/article-meta/abstract/p', 'abstract'),
    )

    out_data = {'pmc_s': [], 'source_s': [], 'text_txt_en': []}
    # The context manager closes the archive even if a document fails to parse.
    with tarfile.open(infile, "r:gz") as tar:
        for member in tar.getmembers():
            f = tar.extractfile(member)
            if f is None:
                # Directories and other non-file members carry no XML.
                continue

            # Read the xml file
            root = ET.parse(f).getroot()

            # Get the PMC id; documents without exactly one are skipped.
            pmc_li = root.findall('front/article-meta/article-id[@pub-id-type="pmc"]')
            if len(pmc_li) != 1:
                continue
            # NOTE(review): the id is stored as found, whereas the
            # author-manuscript extraction prefixes it with "PMC" - confirm
            # this difference is intended.
            pmc = pmc_li[0].text

            # If there are sub-articles, treat each sub-article as one sample.
            # The row id is built from the PMC id and the sub-article id.
            sub_articles = root.findall('sub-article')
            if sub_articles:
                for sub in sub_articles:
                    sub_id = sub.get('id')
                    text = _join_element_texts(sub.findall('body/p'))
                    if text:
                        out_data['pmc_s'].append(f'{pmc}_{sub_id}')
                        out_data['source_s'].append('oa_sub_article')
                        out_data['text_txt_en'].append(text)
                continue

            # Use the first path that matches any element; there is no fall
            # through to a later path when the matched elements have no text.
            for path, source in text_sources:
                elements = root.findall(path)
                if elements:
                    text = _join_element_texts(elements)
                    break
            else:
                # No sections, paragraphs or abstract: nothing to record.
                continue

            if text:
                out_data['pmc_s'].append(pmc)
                out_data['source_s'].append(f'oa_{source}')
                out_data['text_txt_en'].append(text)

    out_df = pd.DataFrame(out_data)
    out_df.to_csv(outfile, index=False)
    print('Output:', outfile)
    print('Output size:', len(out_df))

import glob
# Directory holding the PMC OA (non-commercial) archives; CSVs are written here too.
file_dir = '/labs/sarkerlab/yguo262/biomedical_data/pmc_oa_subset/raw_2024/oa_noncomm/xml/'
files = glob.glob(file_dir + '/*.tar.gz')
# To debug against a single archive, glob for one file name instead, e.g.
# files = glob.glob(f'{file_dir}/oa_noncomm_xml.PMC001xxxxxx.baseline.2023-12-18.tar.gz')
for file_path in files:
    # Keep only the part after the last '/', i.e. the archive's file name.
    file_name = file_path.rpartition('/')[2]
    process_one_tar(file_dir, file_name)