In [1]:
import json
import re

from bs4 import BeautifulSoup
import requests
import tqdm

In [2]:
# Scrape the bioRxiv "recent" listing pages and save one record per article
# (DOI URL, posting date, and author names) to articles.json.
num_pages = 4546  # Number of listing pages as of March 23, 2019
request_timeout = 60  # Seconds to wait on the server before giving up on a page

articles = list()
# A single session reuses the underlying connection across all page requests
with requests.Session() as session:
    for index in tqdm.tnrange(num_pages):
        url = f'https://www.biorxiv.org/content/early/recent?page={index}'
        # Without a timeout a single stalled connection would hang the whole scrape
        res = session.get(url, timeout=request_timeout)
        # Fail loudly on HTTP errors: an error page would otherwise parse to
        # zero articles and leave a silent gap in the output
        res.raise_for_status()
        # Name the parser explicitly so the parse tree does not depend on which
        # optional parsers happen to be installed (also silences the bs4 warning)
        soup = BeautifulSoup(res.content, 'html.parser')
        page_articles = soup.find_all('div', {'class': 'highwire-article-citation'})

        for article_html in page_articles:
            article_dict = dict()

            # The DOI URL is taken from the second child node of the DOI span
            # (presumably the text following a "doi:" label -- verify against
            # the live markup if the site layout changes)
            article_dict['url'] = article_html.find(
                'span', {'class': 'highwire-cite-metadata-doi'}
            ).contents[1].strip()

            # Date is in format like "YYYY/MM/DD"
            article_dict['date'] = re.search('(?<=/)[0-9]{4}/[0-9]{2}/[0-9]{2}',
                                             article_html.attrs['data-apath']).group()

            authors = list()
            authors_html = article_html.find_all('span', {'class': 'highwire-citation-author'})
            for author in authors_html:
                # Don't include collaborations or consortia as authors
                if author.find('span', {'class': 'nlm-collab'}):
                    continue
                first = author.find('span', {'class': 'nlm-given-names'})
                first = first.text if first else ""  # Some authors have only a surname
                last = author.find('span', {'class': 'nlm-surname'}).text
                authors.append((last, first))

            # Only add articles that have non-collaboration authors
            if authors:
                article_dict['authors'] = authors
                articles.append(article_dict)

# Tuples are serialized as JSON arrays, so each author is stored as [last, first]
with open('articles.json', 'w') as f:
    json.dump(articles, f, indent=2)

HBox(children=(IntProgress(value=0, max=4546), HTML(value='')))


