In [None]:
import os
import re
import time
from pathlib import Path

import pandas as pd
import numpy as np
import xmltodict
from Bio import Entrez
from tqdm import tqdm_notebook
from bs4 import BeautifulSoup

In order to easily extract references from articles, I first needed to determine which articles are available as full texts in the PMC Open Access subset. To do this, I downloaded the list of all articles in the Open Access subset and determined which PMIDs from the MTI ML Dataset are also in the Open Access subset.

In [None]:
# Load the PMC Open Access file list and the MTI ML train/test PMID lists,
# then save the Open Access rows whose PMID appears in each MTI split.
oa_list = pd.read_csv("oa_file_list.csv")


def _read_pmid_list(path):
    """Return a one-column DataFrame ('PMID') with the integer PMIDs in `path`.

    Assumes the file holds one PMID per line -- TODO confirm against the
    MTI ML dataset files. Blank lines are skipped; surrounding whitespace
    (including the trailing newline that `readlines()` would keep) is ignored
    by `int()`.

    Raises:
        ValueError: if a non-blank line is not an integer.
    """
    with open(path, "r") as pmid_file:
        pmids = [int(line) for line in pmid_file if line.strip()]
    return pd.DataFrame({'PMID': pmids})


mti_train = _read_pmid_list("2013_MTI_ML_DataSet/PMIDs_train")
mti_test = _read_pmid_list("2013_MTI_ML_DataSet/PMIDs_test")

# Compare PMIDs as numbers on both sides so that the match does not depend on
# how pandas inferred the dtype of the PMID column (unparseable -> NaN).
oa_pmids = pd.to_numeric(oa_list.PMID, errors="coerce")

mti_oaSubset_train = oa_list[oa_pmids.isin(mti_train.PMID)]
mti_oaSubset_train.to_csv("2013_MTI_in_OA_train.csv", index=False)

mti_oaSubset_test = oa_list[oa_pmids.isin(mti_test.PMID)]
mti_oaSubset_test.to_csv("2013_MTI_in_OA_test.csv", index=False)

Next, I downloaded the full texts for all PMIDs in the 2013 MTI ML Dataset that are also in PMC's Open Access subset (determined above).

In [None]:
# Download the PMC full-text XML for every Accession ID in the training
# subset, skipping IDs whose XML file already exists in "./PMC XMLs/".
# NOTE(review): this cell reads "./FUSE/2013_MTI_in_OA_train.csv" while the
# other cells write/read "2013_MTI_in_OA_train.csv" -- confirm the intended path.
mti_train = pd.read_csv("./FUSE/2013_MTI_in_OA_train.csv")
ids_to_get = mti_train["Accession ID"].tolist()

# PMC_errors stores any errors from PMC's side
pmc_errors = []

# The contact address does not change between requests, so set it once.
Entrez.email = "kgasper@unomaha.edu"

# Minimum number of seconds spent per download, to throttle the request rate.
MIN_REQUEST_INTERVAL = .33

for ID in tqdm_notebook(ids_to_get):
    start_time = time.perf_counter()
    file = Path("./PMC XMLs/{}.xml".format(ID))
    if not file.exists():
        handle = Entrez.efetch(db="pmc", id=ID, retmode="xml")
        try:
            xmlString = handle.read()
        finally:
            # Always release the connection, even if the read fails.
            handle.close()

        # Depending on the Biopython version the payload may arrive as bytes;
        # decode it so it can be written through a text-mode file below.
        if isinstance(xmlString, bytes):
            xmlString = xmlString.decode("utf-8")
        element = xmltodict.parse(xmlString)

        # Check for an error on PMC's side and record it
        pmc_error = 'error' in element['pmc-articleset']

        if pmc_error:
            pmc_errors.append(ID)
        else:
            # The context manager flushes and closes the file after each write.
            with file.open("w") as file_out:
                file_out.write(xmlString)

        # Compute the remaining wait once: re-reading the clock after the
        # comparison could yield a negative value, which time.sleep rejects.
        remaining = MIN_REQUEST_INTERVAL - (time.perf_counter() - start_time)
        if remaining > 0:
            time.sleep(remaining)

References were extracted from the full-text XML files using the Beautiful Soup package. Initially, I tried to do this with xmltodict, which turned out to be very difficult. However, that attempt exposed me to all the different formats of the full-text XMLs, and I was able to see that references are always under the "back" tag and that they always have the "pub-id" identifier in their tags.

In [None]:
# Collect, for every article in the training subset, the text of each
# 'pub-id' tag found under the 'back' tag of its downloaded PMC XML file.
mti_oaSubset_train = pd.read_csv("2013_MTI_in_OA_train.csv")

# List for the references: one row per article, [Accession ID, pub-id, ...]
mti_refs = []

# This list is for IDs that don't have the 'back' tag, to investigate later.
ids_to_check = []

# FileNotFoundErrors
fnfe = []

# Extract references from the XML files
for ID in tqdm_notebook(mti_oaSubset_train['Accession ID']):
    try:
        # The context manager closes the file as soon as it has been parsed.
        with open("./PMC XMLs/{}.xml".format(ID), "r") as handle:
            # NOTE(review): no parser is named, so Beautiful Soup picks
            # whichever one is installed -- consider pinning it.
            soup = BeautifulSoup(handle.read())
    except FileNotFoundError:
        fnfe.append(ID)
        continue

    # add IDs to the error list if they don't have the 'back' tag and to
    # the samples list if they do
    if soup.back is None:
        ids_to_check.append(ID)
        continue

    sample = [ID]
    for pubid in soup.back.find_all('pub-id'):
        # .string is None when the tag holds more than a single text node.
        sample.append(pubid.string)
    mti_refs.append(sample)

mti_refs = pd.DataFrame(mti_refs)

# Load the PMC ID table that is used to translate DOIs into PMIDs.
PMC_ids = pd.read_csv("PMC-ids.csv", low_memory=False)

# Columns that play no part in the DOI -> PMID lookup.
_unused_columns = [
    "Journal Title", "ISSN", "eISSN", "Year", "Volume",
    "Issue", "Page", "PMCID", "Manuscript Id",
    "Release Date",
]

# PMIDs as plain digit strings: missing values pass through 0 so the
# column can be cast to int, and are then relabelled "NA".
_pmid_text = (
    PMC_ids.PMID
    .fillna(0)
    .astype(int)
    .astype(str)
    .replace("0", "NA")
)

DOI_PMIDs = PMC_ids.drop(columns=_unused_columns).assign(PMID=_pmid_text)

# Find DOIs and convert them to PMIDs if possible.
# Build the lookup once so each cell costs one hash lookup, not a table scan.
_doi_rows = DOI_PMIDs.dropna(subset=["DOI"])
_doi_is_repeated = _doi_rows.DOI.duplicated(keep=False)
_unique_doi_to_pmid = dict(zip(_doi_rows.DOI[~_doi_is_repeated],
                               _doi_rows.PMID[~_doi_is_repeated]))
_ambiguous_dois = set(_doi_rows.DOI[_doi_is_repeated])


def _doi_to_pmid(value):
    """Translate one reference cell from DOI to PMID where possible.

    Values that do not look like a DOI (text starting with "10.") are
    returned unchanged. A DOI listed exactly once in DOI_PMIDs is replaced by
    that row's PMID, a DOI that is not listed becomes NaN, and a DOI listed
    more than once is left as it is.
    NOTE(review): the PMID returned for a listed DOI may be the placeholder
    string "NA" if DOI_PMIDs uses it for missing PMIDs -- confirm whether such
    references should be dropped instead.
    """
    text = str(value)
    if not re.match(r"^[1][0][.]..*$", text):
        return value
    if text in _unique_doi_to_pmid:
        return _unique_doi_to_pmid[text]
    if text in _ambiguous_dois:
        return value
    return np.nan


for col in tqdm_notebook(mti_refs.columns):
    mti_refs[col] = mti_refs[col].map(_doi_to_pmid)

# Remove IDs in the format "2-s......."
mti_refs = mti_refs.replace("^2[-]s..*$", np.nan, regex=True)

# Make edge list by melting the DF: one (article, reference) pair per row.
# The first column of mti_refs holds the article ID. Its label is looked up
# instead of hard-coded because a DataFrame built from a list of lists has
# integer column labels (0, 1, ...), not the strings '0', '1', ...
_source_col = mti_refs.columns[0]
edge_list = pd.melt(mti_refs, id_vars=[_source_col], value_name="_reference")

# Drop the melt bookkeeping column and give the two remaining columns the
# names used in the saved file: '0' (article) and '1' (reference).
edge_list = edge_list.drop("variable", axis=1)
edge_list = edge_list.rename(columns={_source_col: '0', "_reference": '1'})

# TODO: deal with this -- hand-written replacements for reference strings
# that were not converted automatically (presumably to their PMIDs; verify).
_manual_replacements = {
    " 10.1007/s11606-011-1968-2": "22282311",
    "120/4/e902": "17908746",
    "121/3/575": "18310208",
    "353/5/487": "16079372",
    "163/2/141": "19188646",
    "13/7/930": "18809644",
}
edge_list = edge_list.replace(_manual_replacements)

# Rows without a reference (padding from shorter reference lists) are dropped.
edge_list = edge_list.dropna()

# Sort list, drop duplicates and save
edge_list = edge_list.sort_values(by=['0'])
edge_list = edge_list.drop_duplicates()
edge_list.to_csv("edge_list.csv", index=False)