In [7]:
!pip install pandas numpy



In [8]:
import pandas as pd
import numpy as np

In [22]:
import io
import pandas as pd

# Paths to the training-split `.qrels` files (one for the DTA folder, one for
# the Intervention folder), relative to the notebook's working directory.
training_data_paths = [
    "data/CLEF TAR Task 2/Training/DTA/qrels/full.train.dta.abs.2019.qrels",
    "data/CLEF TAR Task 2/Training/Intervention/qrels/full.train.int.abs.2019.qrels",
]

# Paths to the matching test-split `.qrels` files.
test_data_paths = [
    "data/CLEF TAR Task 2/Testing/DTA/qrels/full.test.dta.abs.2019.qrels",
    "data/CLEF TAR Task 2/Testing/Intervention/qrels/full.test.intervention.abs.2019.qrels",
]

def parse_qrels_file(file):
    """Read a whitespace-separated qrels file into a DataFrame.

    Every row is read as four fields named ``topic_id``, ``idk``, ``PID`` and
    ``relevance``; the ``idk`` column is dropped before returning.

    Args:
        file: Anything ``pandas.read_csv`` accepts (a path or a file-like object).

    Returns:
        A DataFrame with the columns ``topic_id``, ``PID`` and ``relevance``,
        or ``None`` if the file could not be read (a message is printed).
    """
    try:
        qrels = pd.read_csv(
            file,
            # Raw string: "\s" is not a valid escape sequence in a normal literal.
            sep=r"\s+",
            header=None,
            names=["topic_id", "idk", "PID", "relevance"],
        )

        return qrels.drop(columns=["idk"])
    except Exception as e:
        # Best-effort by design: report the problem and let the caller decide.
        print("Could not parse qrel file", e)
        return None

In [11]:
import requests
import xml.etree.ElementTree as ET

def fetch_articles(pids):
    """Download PubMed records from the NCBI EFetch endpoint as XML text.

    NOTE: a later cell in this notebook defines another function that is also
    called ``fetch_articles``; once that cell has run, this name refers to the
    later definition.

    Args:
        pids: A single PubMed ID or an iterable of IDs (strings or integers).

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    # EFetch expects the UIDs as ONE comma-separated ``id`` value.
    if isinstance(pids, (str, bytes)) or not hasattr(pids, '__iter__'):
        id_param = pids
    else:
        id_param = ','.join(str(pid) for pid in pids)

    payload = {'db': 'pubmed', 'id': id_param, 'rettype': 'xml', 'retmode': 'xml'}
    r = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params=payload,
        timeout=60,  # never hang forever on a stalled connection
    )
    # Fail loudly instead of handing an error page to the XML parser.
    r.raise_for_status()

    return r.content.decode('utf-8')

def chunks(lst, n, start = 0):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Slicing begins at index ``start``; the final slice may be shorter than ``n``.
    """
    offsets = range(start, len(lst), n)
    yield from (lst[lo:lo + n] for lo in offsets)
        
def _element_text(elem):
    """Return the stripped text of ``elem`` including nested markup, or None if ``elem`` is None."""
    if elem is None:
        return None
    # ``elem.text`` stops at the first child tag (e.g. <i>, <sub>) and is None
    # when the element starts with one; ``itertext`` walks the whole subtree.
    return "".join(elem.itertext()).strip()


def xml_res_to_list(xml):
    """Extract ``(pmid, title, abstract)`` tuples from an EFetch XML document.

    Args:
        xml: XML text whose root element has ``PubmedArticle`` children.

    Returns:
        A list with one tuple per ``PubmedArticle``. ``pmid`` is the text of the
        first ``PMID`` element, ``title`` the text of the first ``ArticleTitle``
        and ``abstract`` the texts of all ``Abstract/AbstractText`` elements
        joined with single spaces. A field is ``None`` when its element is absent.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml`` is not well-formed.
    """
    articles = []
    root = ET.fromstring(xml)

    for xml_article in root.findall('PubmedArticle'):
        pmid_elem = xml_article.find('.//PMID')
        pmid = pmid_elem.text if pmid_elem is not None else None

        title = _element_text(xml_article.find('.//ArticleTitle'))

        # An abstract may be split into several AbstractText sections; keep all of them.
        section_elems = xml_article.findall('.//Abstract/AbstractText')
        if section_elems:
            section_texts = [_element_text(section) for section in section_elems]
            abstract = " ".join(text for text in section_texts if text)
        else:
            abstract = None

        articles.append((pmid, title, abstract))

    return articles
                

In [2]:
import concurrent.futures


def _fetch_pubmed_xml(pids):
    """Request the PubMed records for ``pids`` from NCBI EFetch and return the XML text.

    Kept under its own private name because this cell defines a function called
    ``fetch_articles`` further down, so that name cannot be used for the HTTP call.
    """
    import requests  # local import so this cell does not rely on another cell's imports

    payload = {
        'db': 'pubmed',
        # EFetch expects the UIDs as ONE comma-separated ``id`` value.
        'id': ','.join(str(pid) for pid in pids),
        'rettype': 'xml',
        'retmode': 'xml',
    }
    response = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params=payload,
        timeout=60,
    )
    # Fail loudly instead of handing an error page to the XML parser.
    response.raise_for_status()

    return response.content.decode('utf-8')


def write_chunk(pids, idx, total):
    """Fetch one chunk of articles and store it as ``./data/articles/chunk_{idx}.csv``.

    Args:
        pids: The PubMed IDs belonging to this chunk.
        idx: Zero-based chunk index; used for the file name and the progress output.
        total: Denominator for the printed progress percentage
            (the number of chunks when called via ``fetch_articles``).

    Returns:
        The chunk name, ``'chunk_{idx}'``.
    """
    import os

    articles_xml = _fetch_pubmed_xml(pids)
    articles = xml_res_to_list(articles_xml)

    # Worker threads may get here at the same time; exist_ok makes that harmless.
    os.makedirs('./data/articles', exist_ok=True)

    df = pd.DataFrame(articles, columns =['PID', 'title', 'abstract'])
    df.to_csv(f'./data/articles/chunk_{idx}.csv', index=False)

    print(np.floor((idx +1 ) / total * 100), "%")

    return f'chunk_{idx}'

# Retained so code that reads this module-level name keeps working;
# fetch_articles numbers its chunks locally and does not modify it.
chunks_written = 0


def execute(chunks_written, pids_chunk, total_pids):
    """Thin wrapper around ``write_chunk`` with the chunk index as first argument.

    Args:
        chunks_written: Zero-based index of the chunk to write.
        pids_chunk: The PubMed IDs of that chunk.
        total_pids: Progress denominator, either a number or a sized collection
            (in which case its length is used).

    Returns:
        Whatever ``write_chunk`` returns.
    """
    try:
        total = len(total_pids)
    except TypeError:
        # A plain count was passed instead of a collection.
        total = total_pids

    res = write_chunk(pids_chunk, chunks_written, total)

    return res

def fetch_articles(pids, chunk_size=300, max_workers=5):
    """Download all articles for ``pids`` in chunks, writing one CSV per chunk.

    NOTE(review): NCBI's usage guidelines limit request rates for clients
    without an API key; confirm that ``max_workers`` stays within them.

    Args:
        pids: Sliceable, sized collection of PubMed IDs.
        chunk_size: Number of IDs per request / per CSV file.
        max_workers: Number of worker threads fetching in parallel.

    Returns:
        The names of the written chunks, in completion order.

    Raises:
        Exception: Any error raised inside a worker is re-raised here.
    """
    pid_chunks = list(chunks(pids, chunk_size))
    total_chunks = len(pid_chunks)
    written = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(execute, idx, chunk, total_chunks)
            for idx, chunk in enumerate(pid_chunks)
        ]
        for future in concurrent.futures.as_completed(futures):
            # result() re-raises worker exceptions instead of losing them silently.
            written.append(future.result())

    return written

In [4]:
# NOTE(review): this cell rebinds `training_data_paths` / `test_data_paths`,
# which another cell of this notebook sets to individual `.qrels` files.
# The values below look like directories rather than files, while
# `parse_qrels_file` hands each entry to `pd.read_csv` — confirm which
# definition is meant to be active before building the qrels frames.
training_data_paths = [
    "data/DATA2019/Training/DTA",
    "data/DATA2019/Training/Intervention",
]

test_data_paths = [
    "data/DATA2019/Testing/DTA",
    "data/DATA2019/Testing/Intervention",
]

In [20]:
import glob
import os
import pandas as pd 

# Collect every per-chunk CSV under data/articles/ and stack them into one frame.
chunk_csv_paths = glob.glob(os.path.join('', "data/articles/chunk*.csv"))
articles = pd.concat([pd.read_csv(csv_path) for csv_path in chunk_csv_paths])

In [None]:
# Parse every qrels file of each split and stack the results per split.
training_qrels = pd.concat(list(map(parse_qrels_file, training_data_paths)))
test_qrels = pd.concat(list(map(parse_qrels_file, test_data_paths)))

# Distinct PIDs referenced by either split, then download their articles.
all_qrels = pd.concat([training_qrels, test_qrels])
pids = all_qrels['PID'].unique()
fetch_articles(pids)

# Attach article fields to each split and write the result to CSV.
# NOTE(review): `articles` comes from another cell that reads the chunk CSVs;
# confirm it is (re)loaded after the fetch above has written them.
train_with_articles = training_qrels.merge(articles, on="PID")
train_with_articles.to_csv('data/train.csv', index=False)

test_with_articles = test_qrels.merge(articles, on="PID")
test_with_articles.to_csv('data/test.csv', index=False)

array([ 7072537,  8748845,  3819738, ..., 12504236,  1872650, 18038549])

In [10]:
# Show the combined articles DataFrame as this cell's output.
articles

Unnamed: 0,PID,title,abstract
0,8811504,"Somatostatin-like immunoreactivity, its molecu...",There is some evidence that Parkinson's diseas...
1,10064172,The effect of dehydroepiandrosterone sulfate a...,We measured cerebrospinal fluid (CSF) levels o...
2,3037978,Effects of phosphatidylserine on immunologic i...,
3,10973954,SERPIN regulation of factor XIa. The novel obs...,In the present studies we have made the novel ...
4,19056308,Levels of the light subunit of neurofilament t...,Neurofilaments are major structural elements o...
...,...,...,...
295,13130173,[Markers of bone formation and resorption in p...,Biochemical bone markers are a valuable noninv...
296,12525884,Association of an interleukin-1beta gene polym...,Inflammation is thought to promote neuronal ce...
297,15218778,[Coexistence of Alzheimer's disease with pseud...,Alzheimer disease is a type of cerebral amyloi...
298,10201277,Recent advances in dementia research in Japan:...,"In a previous article, recent reports by Japan..."
