In [59]:
import numpy as np
import requests
from bs4 import BeautifulSoup

## SciRate

### Obtain the paper list from a single page (one day)

In [87]:
def get_papers_scirate(url='https://scirate.com/arxiv/quant-ph?date=2018-04-11&range=1',
                       timeout=30):
    """Download one listing page and return its papers in page order.

    Parameters
    ----------
    url : str
        Address of the listing page to fetch.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may block indefinitely on an unresponsive host.

    Returns
    -------
    list of dict
        One dict per ``<li class="paper tex2jax">`` element, with keys
        ``'title'`` (text of the ``title`` div), ``'authors'`` (list of
        whitespace-stripped names) and ``'rank'`` (0-based position of the
        entry on the page).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so an error page is
        never silently parsed into an empty list.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    paper_list = soup.find_all('li', class_='paper tex2jax')

    papers = []
    for rank, paper in enumerate(paper_list):
        title = paper.find('div', class_='title').text
        # The authors div holds one ', '-separated string; strip the
        # whitespace left around each name after splitting.
        author_text = paper.find('div', class_='authors').text
        authors = [name.strip() for name in author_text.split(', ')]
        papers.append({
            'title': title,
            'authors': authors,
            'rank': rank
        })

    return papers

In [96]:
# Fetch the listing at the function's default URL and keep the parsed entries.
papers_scirate = get_papers_scirate()

In [97]:
# Spot-check the scrape: number of entries, then each field of the first entry.
print('The number of papers: ', len(papers_scirate))
print('The first paper title: ', papers_scirate[0]['title'])
print('The first paper authors: ', papers_scirate[0]['authors'])
print('The first paper rank: ', papers_scirate[0]['rank'])

The number of papers:  25
The first paper title:  Strawberry Fields: A Software Platform for Photonic Quantum Computing
The first paper authors:  ['Nathan Killoran', 'Josh Izaac', 'Nicolás Quesada', 'Ville Bergholm', 'Matthew Amy', 'Christian Weedbrook']
The first paper rank:  0


## arXiv

### Obtain the paper list from a single page (one day)

In [99]:
def get_papers_arxiv(url_start='https://arxiv.org/catchup?action=/catchup',
                     archive='quant-ph', day='11', month='04', year='2018',
                     timeout=30):
    """POST the catch-up form and return the entries that are neither
    replacements nor cross-lists.

    Parameters
    ----------
    url_start : str
        Address the form is posted to.
    archive : str
        Value sent as the ``archive`` form field.
    day, month, year : str or int
        Date sent as ``sday`` / ``smonth`` / ``syear``. Day and month are
        zero-padded to two characters, so ``4`` and ``'04'`` are equivalent.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may block indefinitely on an unresponsive host.

    Returns
    -------
    list of dict
        One dict per kept entry with keys ``'title'``, ``'authors'`` (list
        of names) and ``'order'``. ``'order'`` is the 0-based position of the
        entry among all ``<dd>``/``<dt>`` pairs on the page, so skipped
        entries still consume an index.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so an error page is
        never silently parsed into an empty list.
    """
    form_info = {
        'archive': archive,
        # Zero-pad to the two-character form the original hard-coded values used.
        'sday': str(day).zfill(2),
        'smonth': str(month).zfill(2),
        'syear': str(year),
        # NOTE(review): presumably selects the listing without abstracts — confirm.
        'method': 'without'
    }
    response = requests.post(url_start, data=form_info, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    # Each <dd> (paper body) is paired with the <dt> (identifier line) at the
    # same position in the page.
    paper_list = soup.find_all('dd')
    meta_list = soup.find_all('dt')

    papers = []
    for i, (paper, meta) in enumerate(zip(paper_list, meta_list)):
        meta_info = meta.find('span', class_='list-identifier')
        if 'replaced' in meta_info.text or 'cross-list' in meta_info.text:
            continue

        # Keep what sits between the 'Title: ' label and the next newline.
        title_text = paper.find('div', class_='list-title mathjax').text
        title = title_text.split('\nTitle: ')[1].split('\n')[0]

        # Drop the first two lines and the last line of the authors block;
        # each remaining line is cut at its first comma.
        author_lines = paper.find('div', class_='list-authors').text.split('\n')[2:-1]
        authors = [line.split(',')[0] for line in author_lines]

        papers.append({
            'title': title,
            'authors': authors,
            'order': i
        })

    return papers

In [100]:
# Fetch the listing with the function's default arguments and keep the parsed entries.
papers_arxiv = get_papers_arxiv()

In [101]:
# Spot-check the scrape: number of entries, then each field of the first entry.
print('The number of papers: ', len(papers_arxiv))
print('The first paper title: ', papers_arxiv[0]['title'])
print('The first paper authors: ', papers_arxiv[0]['authors'])
print('The first paper order: ', papers_arxiv[0]['order'])

The number of papers:  25
The first paper title:  Strawberry Fields: A Software Platform for Photonic Quantum Computing
The first paper authors:  ['Nathan Killoran', 'Josh Izaac', 'Nicolás Quesada', 'Ville Bergholm', 'Matthew Amy', 'Christian Weedbrook']
The first paper order:  0


In [102]:
# Query papers for a specific period
# TODO: build form_info and send the request automatically

url = 'https://arxiv.org/catchup?action=/catchup'
form_info = {
    'archive': 'quant-ph',
    'sday': '11',
    'smonth': '04',
    'syear': '2018',
    'method': 'without'
}
# Bound the wait (requests has no default timeout) and fail loudly on an HTTP
# error so an error page is never parsed as if it were a listing.
response = requests.post(url, data=form_info, timeout=30)
response.raise_for_status()

In [38]:
# Parse the HTML text of `response` with the lxml parser.
# NOTE(review): this cell's execution count (38) is lower than that of the cell
# that assigns `response` (102) — re-run top to bottom to confirm it is not stale.
soup = BeautifulSoup(response.text, 'lxml')