In [4]:
from Bio import Entrez
import pandas as pd
import numpy as np
from tqdm import trange
from datetime import datetime as dt

In [5]:
def search(query, retmax, mindate=None, maxdate=None, reldate=None):
    """Run a PubMed ESearch and return the parsed response.

    Parameters
    ----------
    query : str
        Search expression, sent to ESearch as ``term``.
    retmax : int
        Upper bound on the number of PMIDs returned, sent as ``retmax``.
    mindate, maxdate : optional
        Date-range bounds, forwarded unchanged as ``mindate`` / ``maxdate``.
        The request always uses ``datetype='edat'``.
    reldate : optional
        Forwarded as ``reldate`` when not ``None``. Previously this argument
        was accepted but silently dropped.

    Returns
    -------
    The object produced by ``Entrez.read`` for the ESearch response.
    """
    Entrez.email = '#####'
    Entrez.api_key = '#####'

    params = {
        'db': 'pubmed',
        'retmax': retmax,
        'retmode': 'xml',
        'term': query,
        'mindate': mindate,
        'maxdate': maxdate,
        'datetype': 'edat',
    }
    # Only send reldate when the caller asked for it, so existing calls issue
    # exactly the same request as before.
    if reldate is not None:
        params['reldate'] = reldate

    handle = Entrez.esearch(**params)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying HTTP response even if parsing fails.
        handle.close()

def fetch_details(id_list):
    """Fetch the PubMed records for a batch of PMIDs via EFetch.

    Parameters
    ----------
    id_list : iterable
        PMIDs to fetch. Each item is converted with ``str`` before being
        comma-joined, so both strings and ints are accepted.

    Returns
    -------
    The object produced by ``Entrez.read`` for the EFetch XML response.
    """
    ids = ','.join(str(pmid) for pmid in id_list)
    Entrez.email = '#####'
    Entrez.api_key = '#####'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying HTTP response even if parsing fails.
        handle.close()

In [6]:
def parse_article(article):
    """Flatten one parsed PubMed article record into a plain dict.

    Parameters
    ----------
    article : mapping
        A single article record exposing ``'MedlineCitation'`` and
        ``'PubmedData'``. ``ELocationID`` and ``History`` entries must carry
        an ``attributes`` mapping.

    Returns
    -------
    dict
        Always contains ``pmid``, ``doi``, ``title``, ``abstract``,
        ``authors``, ``author_affils``, ``keywords``, ``mesh_terms`` and
        ``references_pmids``; optional ones are ``np.nan`` when unavailable.
        ``article_date``, ``pubmed_date``, ``article_type``, ``lang``,
        ``journal``, ``journal_short`` and ``journal_country`` are only
        present when they could be read from the record.

    Notes
    -----
    * The lookups for PMID, ``ELocationID``, ``ArticleTitle`` and
      ``PubmedData/History`` are not guarded, so a record lacking them raises.
    * Only the first ``AbstractText`` entry is kept as the abstract.
    * ``authors`` / ``author_affils`` are all-or-nothing: if any author entry
      lacks the needed fields, the whole value becomes ``np.nan``.
    """
    # Errors that mean "this optional field is absent or malformed". Listing
    # them explicitly (instead of a bare ``except``) keeps KeyboardInterrupt
    # and SystemExit from being swallowed during long scraping runs.
    missing = (KeyError, IndexError, TypeError, ValueError, AttributeError)

    citation = article['MedlineCitation']
    body = citation['Article']

    # Keys are inserted in a fixed order; it determines the column order of a
    # DataFrame built from these dicts.
    record = {}

    # PMID
    record['pmid'] = str(citation['PMID'])

    # DOI: it is mixed in with other ELocationID kinds (e.g. PII) and is not
    # always there. Stop at the first DOI so that a later non-DOI entry cannot
    # reset a value that was already found.
    doi = np.nan
    for eloc in body['ELocationID']:
        if eloc.attributes['EIdType'] == 'doi':
            doi = str(eloc)
            break
    record['doi'] = doi

    # Title
    record['title'] = body['ArticleTitle']

    # Abstract (first AbstractText entry only)
    try:
        record['abstract'] = body['Abstract']['AbstractText'][0]
    except missing:
        record['abstract'] = np.nan

    # Article date: not always present. The values of the first ArticleDate
    # entry are joined in their stored order and parsed as year-month-day.
    try:
        date_text = '-'.join(list(body['ArticleDate'][0].values()))
        record['article_date'] = dt.strptime(date_text, "%Y-%m-%d")
    except missing:
        pass

    # Date the record became available on PubMed: the first three values of
    # the History entry whose PubStatus is 'pubmed'.
    for event in article['PubmedData']['History']:
        if event.attributes['PubStatus'] == 'pubmed':
            pubmed_text = '-'.join(list(event.values())[:3])
            record['pubmed_date'] = dt.strptime(pubmed_text, "%Y-%m-%d")

    # Article type (first publication type only)
    try:
        record['article_type'] = str(body['PublicationTypeList'][0])
    except missing:
        pass

    # Article language (first listed language only)
    try:
        record['lang'] = body['Language'][0]
    except missing:
        pass

    # Long-form journal title
    try:
        record['journal'] = body['Journal']['Title']
    except missing:
        pass

    # ISO journal abbreviation
    try:
        record['journal_short'] = body['Journal']['ISOAbbreviation']
    except missing:
        pass

    # Journal country
    try:
        record['journal_country'] = citation['MedlineJournalInfo']['Country']
    except missing:
        pass

    # Authors as "LastName ForeName"; sometimes there are no proper authors.
    try:
        authors = [person['LastName'] + ' ' + person['ForeName']
                   for person in body['AuthorList']]
    except missing:
        authors = np.nan
    record['authors'] = authors

    # First affiliation of every author
    try:
        affils = [person['AffiliationInfo'][0]['Affiliation']
                  for person in body['AuthorList']]
    except missing:
        affils = np.nan
    record['author_affils'] = affils

    # Keywords (first keyword list only)
    try:
        record['keywords'] = [str(kw) for kw in citation['KeywordList'][0]]
    except missing:
        record['keywords'] = np.nan

    # MeSH descriptor names; not every record has them.
    try:
        mesh_terms = [str(heading['DescriptorName'])
                      for heading in citation['MeshHeadingList']]
    except missing:
        mesh_terms = np.nan
    record['mesh_terms'] = mesh_terms

    # First article id of each entry in the first reference list, if included.
    try:
        references_pmids = [
            str(ref['ArticleIdList'][0])
            for ref in article['PubmedData']['ReferenceList'][0]['Reference']
        ]
    except missing:
        references_pmids = np.nan
    record['references_pmids'] = references_pmids

    return record

In [7]:
def retrieve_articles(search_term, retmax, chunk_size = 50, mindate=None, maxdate=None):
    """Search PubMed, download the matching records in chunks and tabulate them.

    Parameters
    ----------
    search_term : str
        Query passed to ``search``.
    retmax : int
        Maximum number of PMIDs requested from the search.
    chunk_size : int, default 50
        Number of PMIDs sent to ``fetch_details`` per request; must be >= 1.
    mindate, maxdate : optional
        Date bounds forwarded to ``search``.

    Returns
    -------
    tuple
        ``(df, search_time, most_recent_date)`` where ``df`` has one row per
        parsed article (``pmid`` cast to int), ``search_time`` is the local
        time taken right after the search returned, and ``most_recent_date``
        is the maximum of the ``pubmed_date`` column. When nothing was
        retrieved, or no record carried a ``pubmed_date``,
        ``most_recent_date`` is ``pd.NaT``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Fetch a list of PMIDs from the search term
    result = search(search_term, retmax=retmax, mindate=mindate, maxdate=maxdate)
    id_list = result['IdList']

    search_time = dt.now()

    print(f"List of {len(id_list)} PMIDs retrieved of {result['Count']} results.")
    print("Downloading and parsing:")

    paper_list = []

    # Retrieve in chunks
    for chunk_start in trange(0, len(id_list), chunk_size):
        chunk = id_list[chunk_start:chunk_start + chunk_size]
        papers = fetch_details(chunk)
        paper_list.extend(parse_article(paper) for paper in papers['PubmedArticle'])

    df = pd.DataFrame(paper_list)

    # An empty result has no columns at all, so the pmid cast and the date
    # lookup below would raise AttributeError; return the empty frame instead.
    if df.empty:
        return (df, search_time, pd.NaT)

    df['pmid'] = df['pmid'].astype(int)

    # pubmed_date is only set for records that have a 'pubmed' history entry,
    # so the column may be missing entirely.
    if 'pubmed_date' in df.columns:
        most_recent_date = df['pubmed_date'].max()
    else:
        most_recent_date = pd.NaT

    return (df, search_time, most_recent_date)

In [12]:
# PubMed query: any of the quoted AI/ML phrases, or the "artificial intelligence"
# MeSH heading. A stray "[" that preceded the first phrase has been removed so
# the only square brackets left are the [MeSH Terms] field tag.
# NOTE(review): `(unsupervised learning)` is the only phrase left unquoted —
# confirm whether that is intentional.
search_term = """(((((((("artificial intelligence") OR ("deep learning")) OR ("machine learning")) OR ("neural net")) OR ("transfer learning")) OR ("supervised learning")) OR (unsupervised learning)) ) OR (artificial intelligence[MeSH Terms])"""

In [16]:
# Run the full scrape. The date bounds are forwarded to the search as
# mindate/maxdate, retmax caps how many PMIDs the search returns, and
# chunk_size is the number of records fetched per request.
(
    article_df,
    search_time,
    most_recent_article_date,
) = retrieve_articles(
    search_term=search_term,
    retmax=200000,
    chunk_size=50,
    mindate=1998,
    maxdate=2021,
)

  0%|                                                                                         | 0/3232 [00:00<?, ?it/s]

List of 161581 PMIDs retrieved of 161581 results.
Downloading and parsing:


100%|████████████████████████████████████████████████████████████████████████████| 3232/3232 [1:21:53<00:00,  1.52s/it]


In [18]:
# Show the last five rows of the scraped table as a quick sanity check.
article_df.tail(n=5)

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,journal_country,authors,author_affils,keywords,mesh_terms,references_pmids
161534,9417639,,Temporal coding in neural populations?,,NaT,1998-01-07,Journal Article,eng,"Science (New York, N.Y.)",Science,United States,[Fetz E E],"[Department of Physiology, University of Washi...",,"[Action Potentials, Algorithms, Animals, Cell ...",
161535,9417241,,[Computer-assisted documentation of brain deat...,,NaT,1998-01-07,Journal Article,ger,"Anasthesiologie, Intensivmedizin, Notfallmediz...",Anasthesiol Intensivmed Notfallmed Schmerzther,Germany,"[Baumgartner A, Schwarz G, Grims R, Pfurtschel...",,,"[Artificial Intelligence, Brain, Brain Death, ...",
161536,9416755,,An algorithm for quantifying blood pressure la...,,NaT,1998-01-07,Comment,eng,Anesthesiology,Anesthesiology,United States,[Cohn A I],,,"[Algorithms, Anesthesiology, Blood Pressure, E...",
161537,9407227,,Toward robot-assisted vascular microsurgery in...,Experimental protocol in our laboratory routin...,NaT,1998-01-04,Journal Article,eng,Graefe's archive for clinical and experimental...,Graefes Arch Clin Exp Ophthalmol,Germany,"[Jensen P S, Grace K W, Attariwala R, Colgate ...",,,"[Animals, Cats, Equipment Design, Microsurgery...","[8125712, 8406066, 1783011, 3609762, 8361399]"
161538,21244956,,Statistical inference: learning in artificial ...,Artificial neural networks (ANNs) are widely u...,NaT,1998-01-01,Journal Article,eng,Trends in cognitive sciences,Trends Cogn Sci,England,"[Hua Yang H, Murata N, Amari S]",,,,


In [19]:
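# Uncomment to save the raw scrape to disk, so the ~80-minute download above does not have to be repeated:
#article_df.to_csv("data/scraped_raw.csv")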