# Initial Data Aggregation and Preprocessing Pipeline

The general premise of this project is that I am attempting to use citation networks to suggest MeSH terms that can be applied to PubMed citations. For example, if an article indexed on PubMed has been annotated with the MeSH term "Neoadjuvant Therapy", it is quite possible or even likely that some or many of its references have also been annotated with "Neoadjuvant Therapy".

During the initial stages of the project, I chose to use NCBI APIs to retrieve the necessary documents. Later on, I incorporated the entire PubMed and PMC corpora.

I used the 2013 MTI ML dataset as a starting point. Building citation networks requires full texts, so I was not able to attempt term prediction for every document in the dataset. Thus, my approach to the initial data aggregation and preprocessing is as follows:

1. Subset 2013 MTI ML dataset for only those PMIDs that are also in PMC's Open Access subset
2. Pull full text XMLs from PMC for articles in step 1
3. Extract the references from full texts
4. For each extracted reference, pull PubMed citations in XML format (which contain MeSH term annotations) from PubMed using the PubMed API
5. Extract the MeSH terms from each of the PubMed citations

## Subset 2013 MTI ML dataset and pull full texts from PMC

In [None]:
import csv
import logging
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
import xmltodict
from Bio import Entrez
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

In [None]:
# Set up logging
# Problems reported while pulling full texts from PMC go to pmc_api_pull.log.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# logging.getLogger returns the same object every time it is called with the
# same name, so re-running this cell would otherwise stack up file handlers
# and write every message several times. Start from a clean slate.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
handler = logging.FileHandler("pmc_api_pull.log")
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# Read the NCBI API key. Surrounding whitespace (typically the trailing
# newline an editor adds) is stripped so it does not end up in request URLs.
with open("../ncbi.key") as handle:
    api_key = handle.read().strip()

# Add source for oa_file_list here
oa_list = pd.read_csv("../data/oa_file_list.csv")


def _load_pmids(path):
    """Read one PMID per line from *path* into a single-column DataFrame.

    Lines are stripped of surrounding whitespace (including the newline that
    iterating over a file keeps on every line), blank lines are skipped and
    entries that are not numeric are dropped. The resulting ``PMID`` column
    holds integers.
    """
    with open(path, "r") as fp:
        stripped = [line.strip() for line in fp]
    pmids = pd.to_numeric(
        pd.Series([entry for entry in stripped if entry], dtype=object),
        errors="coerce",
    )
    return pd.DataFrame({"PMID": pmids.dropna().astype("int64")})


# Subset the 2013 MTI dataset for only those PMIDs that
# are also in the PMC Open Access file list
# I really do not like that I am using Pandas here, will refactor later when
# there is time to eliminate this dependency and slightly increase performance
mti_train = _load_pmids("../data/PMIDs_train")
mti_test = _load_pmids("../data/PMIDs_test")

# Compare PMIDs numerically on both sides. Comparing the raw lines of the
# PMID files (strings that still end in "\n") against the PMID column of the
# OA list can never match. oa_list itself is left untouched so the CSVs
# written below keep the original columns and values.
oa_pmids = pd.to_numeric(oa_list.PMID, errors="coerce")

mti_subset_train = oa_list[oa_pmids.isin(mti_train.PMID)]
mti_subset_train.to_csv("2013_MTI_in_OA_train.csv")

mti_subset_test = oa_list[oa_pmids.isin(mti_test.PMID)]
mti_subset_test.to_csv("2013_MTI_in_OA_test.csv")

# PMC accession IDs of every article to download (train first, then test)
ids_to_get = mti_subset_train["Accession ID"].tolist() + mti_subset_test["Accession ID"].tolist()

# Save full texts for each PMC ID
# The contact address and API key are the same for every request, so they
# are configured once instead of on every iteration.
Entrez.email = "kgasper@unomaha.edu"
Entrez.api_key = api_key

# Minimum time (seconds) between the start of two consecutive requests.
# This is a delay in accordance with PubMed API usage guidelines
# PubMed allows 3 requests/sec without API key or 10 req/sec with
MIN_REQUEST_INTERVAL = .33

for pmcid in tqdm_notebook(ids_to_get):
    start_time = time.perf_counter()
    xml_path = Path(f"../pmc_xmls/{pmcid}.xml")

    # Already downloaded on an earlier run: nothing to fetch, no delay needed
    if xml_path.exists():
        continue

    handle = Entrez.efetch(db="pmc", id=pmcid, retmode="xml")
    try:
        xmlString = handle.read()
    finally:
        # Release the underlying HTTP connection even if reading fails
        handle.close()
    element = xmltodict.parse(xmlString)

    # Check for an error on PMC's side and record it. A response whose
    # article set is not a mapping at all (e.g. an empty element) carries no
    # article either, so it is treated the same way instead of crashing.
    article_set = element['pmc-articleset']
    pmc_error = not isinstance(article_set, dict) or 'error' in article_set

    if pmc_error:
        logger.error(f"PMC API error - ID: {pmcid}")
    elif isinstance(xmlString, bytes):
        # Depending on the Biopython version the handle yields bytes for XML
        # responses; those are stored exactly as received.
        xml_path.write_bytes(xmlString)
    else:
        with open(xml_path, "w") as file_out:
            file_out.write(xmlString)

    # Measure the elapsed time once: reading the clock a second time for the
    # sleep duration could produce a negative value and make sleep() raise.
    elapsed = time.perf_counter() - start_time
    if elapsed < MIN_REQUEST_INTERVAL:
        time.sleep(MIN_REQUEST_INTERVAL - elapsed)

## Extract references from full texts

The primary goal here is to extract identifiers (DOI or PMID) for each reference in each article and use these identifiers to create an edge list.

In [None]:
# PMC accession IDs of the articles whose references will be extracted
accessions = []

# You might notice the "nohead" in the filename, I removed the header for this file
# using tail
# The files are parsed with the csv module rather than a plain split(","):
# fields that contain commas are quoted in CSV, and a naive split would shift
# the accession ID out of column 3 for those rows.
for subset_csv in (
    "../data/2013_MTI_in_OA_train_nohead.csv",
    "../data/2013_MTI_in_OA_test_nohead.csv",
):
    with open(subset_csv, "r", newline="") as handle:
        for row in csv.reader(handle):
            # Skip blank or truncated rows that have no accession column
            if len(row) > 3:
                accessions.append(row[3])

# Set up logging
# Problems found while extracting references go to reference_extraction.log.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# logging.getLogger returns the same object for the same name, so any file
# handler attached to this logger earlier in the session (or by a previous
# run of this cell) is still there. Remove those handlers first, otherwise
# every message below would also be written to the older log files.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
handler = logging.FileHandler("reference_extraction.log")
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# List for the references
# Each entry is [accession ID, reference identifier, reference identifier, ...]
mti_refs = []

# Extract references from the XML files
# (tqdm_notebook is the progress bar this notebook imports; plain `tqdm` is
# never imported and would raise a NameError here.)
for ID in tqdm_notebook(accessions):
    try:
        with open(f"../pmc_xmls/{ID}.xml", "r") as handle:
            soup = BeautifulSoup(handle.read())
    except FileNotFoundError:
        logger.error(f"FNFE: {ID}")
        continue

    # add IDs to the error log if they don't have the 'back' tag and to
    # the samples list if they do
    if soup.back is None:
        logger.error(f"No refs: {ID}")
        continue

    sample = [ID]
    for pubid in soup.back.find_all('pub-id'):
        # .string is None when the tag does not hold exactly one text node;
        # such entries carry no usable identifier and are skipped.
        if pubid.string is not None:
            # Store a plain str so the list does not hold on to the parsed
            # document through Beautiful Soup's string objects.
            sample.append(str(pubid.string))
    mti_refs.append(sample)

# Create dicts for ID conversions
# Here I want to convert my PMCIDs back to PMIDs and convert any DOIs to
# PMIDs whenever possible. The DOI -> PMID thing is not extremely important,
# because in general PMC is good about adding PMIDs to articles' references
# as they become available, but I wanted to be thorough
dois = {}      # DOI   -> PMID
pmcids = {}    # PMCID -> PMID
# The file is parsed with the csv module rather than a plain split(","):
# fields that contain commas are quoted in CSV, and a naive split would shift
# the DOI / PMCID / PMID columns for those rows.
with open("../data/PMC-ids-nohead.csv", "r", newline="") as handle:
    for row in csv.reader(handle):
        # Rows too short to have a PMID column are ignored
        if len(row) > 9:
            doi, pmcid, pmid = row[7], row[8], row[9]
            if doi:
                dois[doi] = pmid
            pmcids[pmcid] = pmid

# This function converts a DOI or PMCID to a PMID
def fetch_pmid(identifier, dois, pmcids, logger):
    """Convert a DOI or PMCID to a PMID where a mapping is known.

    Args:
        identifier: The identifier to convert. Anything that is not a string
            is returned unchanged.
        dois: Mapping of DOI -> PMID.
        pmcids: Mapping of PMCID -> PMID.
        logger: Logger used to report PMCIDs that have an entry in *pmcids*
            but no PMID.

    Returns:
        * For a DOI (starts with ``10.``): the mapped PMID, or NaN when the
          DOI is unknown or maps to an empty value.
        * For a PMCID (starts with ``PMC``) present in *pmcids*: the mapped
          PMID, or the PMCID itself (after logging an error) when the mapped
          value is empty.
        * Otherwise the identifier itself.
    """
    # Nothing to convert (e.g. None or NaN); also keeps re.match from raising
    if not isinstance(identifier, str):
        return identifier

    if re.match(r"^10\..*$", identifier):
        pmid = dois.get(identifier)
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
        return pmid if pmid else np.nan

    if re.match(r"^PMC.*$", identifier) and identifier in pmcids:
        pmid = pmcids[identifier]
        if pmid:
            return pmid
        logger.error(f"PMCID conversion error: {identifier}")
        return identifier

    # Return original identifier if not a DOI or PMCID
    return identifier

# Convert IDs to PMIDs if possible
# Each sample is updated in place so mti_refs keeps holding the same lists.
for sample in mti_refs:
    sample[:] = [fetch_pmid(identifier, dois, pmcids, logger) for identifier in sample]

# Convert to edge list format and drop non-PMID identifiers:
# A target is kept only if it is a string made up entirely of digits.
# fullmatch with \d+ rejects the empty string and a trailing newline, both of
# which the pattern ^\d*$ would let through; the isinstance check filters out
# NaN (unresolved DOIs) and any other non-string value.
pmid_pattern = re.compile(r"\d+")
unique_edges = set()
for sample in mti_refs:
    # sample[0] is the citing article, the rest are its references
    for target in sample[1:]:
        if isinstance(target, str) and pmid_pattern.fullmatch(target):
            unique_edges.add((sample[0], str(target)))

# Remove duplicates:
# (collecting into a set already did; sort for a reproducible file)
edge_list = sorted(unique_edges)

# Write output
# One "source,target" pair per line
with open("../data/edge_list.csv", "w") as out:
    out.writelines(f"{source},{target}\n" for source, target in edge_list)