In [36]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

import numpy as np
import pandas as pd
from difflib import SequenceMatcher

from parsers.arxiv import scrape_arxiv
from parsers.scirate import scrape_scirate

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Acquisition

In [12]:
# Crawl settings passed to both scrapers:
#   start / end : search span as calendar dates
#   archive     : archive type to crawl
#   method      : crawl with or without the abstract
params = dict(
    start=dict(year=2018, month=3, day=1),
    end=dict(year=2018, month=4, day=1),
    archive='quant-ph',
    method='without',
)

In [13]:
# crawl/scrape Arxiv for the span/archive configured in `params`.
# The result is a sequence whose entries are read later via entry['date']
# and entry['papers']; each paper is a dict with 'authors', 'order', 'title'.
results_arxiv = scrape_arxiv(params)

In [14]:
# crawl/scrape Scirate using the same `params` as the Arxiv crawl.
# The result is a sequence whose entries are read later via entry['papers'];
# each paper is a dict with 'authors', 'rank', 'title'.
results_scirate = scrape_scirate(params)

In [17]:
# 2018-03-01 (first scraped entry): first paper in the Arxiv listing
results_arxiv[0]['papers'][0]

{'authors': ['Kirill P. Kalinin', 'Natalia G. Berloff'],
 'order': 0,
 'title': 'Blockchain platform with proof-of-work based on analog Hamiltonian  optimisers'}

In [18]:
# 2018-03-01 (first scraped entry): first paper in the Scirate results
results_scirate[0]['papers'][0]

{'authors': ['M. B. Hastings'],
 'rank': 0,
 'title': 'A Short Path Quantum Algorithm for Exact Optimization'}

In [19]:
# 2018-03-30 (last scraped entry): first paper in the Arxiv listing
results_arxiv[-1]['papers'][0]

{'authors': ['Beni Yoshida', 'Norman Y. Yao'],
 'order': 0,
 'title': 'Disentangling Scrambling and Decoherence via Quantum Teleportation'}

In [21]:
# 2018-03-30 (last scraped entry): first paper in the Scirate results
results_scirate[-1]['papers'][0]

{'authors': ['Beni Yoshida', 'Norman Y. Yao'],
 'rank': 0,
 'title': 'Disentangling Scrambling and Decoherence via Quantum Teleportation'}

## EDA

Entry ID: paper name (DOI?)
We can create an arbitrary paper ID that maps to each paper's title, authors, and DOI.

Possible features:

- Arxiv order
- Scirate order
- Paper length (pages)
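- Title length (words; note that the `title_length` column built below currently stores the character count, `len(title)`)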
- Number of authors
- Total # of citations of the authors (or first author? last author?)
- Bag of Words of title
- Bag of Words of abstract

In [31]:
# Flatten the per-day Arxiv results into parallel per-paper lists
# (one element per paper, all lists aligned by position).
index = []          # row labels: "<date>_<arxiv order, zero-padded to 4 digits>"
title = []
authors = []
num_authors = []
title_length = []   # NOTE: number of characters in the title, not words
arxiv_order = []

for res in results_arxiv:
    date = res['date']
    for paper in res['papers']:
        order = paper['order']
        # The format spec pads to at least four digits and leaves longer
        # numbers untouched, replacing the manual <10 / <100 / <1000 branches.
        index.append('{}_{:04d}'.format(date, order))

        title.append(paper['title'])
        authors.append(paper['authors'])
        num_authors.append(len(paper['authors']))
        title_length.append(len(paper['title']))
        arxiv_order.append(order)
        

In [40]:
# Attach a Scirate rank to every Arxiv row by matching paper titles.
# Rows that no Scirate paper maps to keep the sentinel rank -1.
scirate_rank = [-1] * len(arxiv_order)

# Position of the first occurrence of each Arxiv title. This gives the same
# row `title.index(...)` would return, without a linear scan per paper.
title_to_row = {}
for row, title_arx in enumerate(title):
    title_to_row.setdefault(title_arx, row)

for res in results_scirate:
    for paper in res['papers']:
        title_sci = paper['title']
        idx = title_to_row.get(title_sci)
        if idx is None:
            # No exact match: fall back to the most similar Arxiv title
            # according to difflib's SequenceMatcher ratio. This is the slow
            # path (one comparison per Arxiv title).
            # NOTE(review): the best candidate is accepted however low its
            # ratio is, and it may overwrite a rank already assigned to that
            # row - consider a minimum-similarity threshold.
            str_match = np.array([SequenceMatcher(a=title_sci, b=title_arx).ratio()
                                  for title_arx in title])
            idx = int(np.argmax(str_match))
        scirate_rank[idx] = paper['rank']

In [47]:
# Column order of the per-paper feature table
columns = [
    'title',
    'authors',
    'num_authors',
    'title_length',
    'arxiv_order',
    'scirate_rank',
]

In [74]:
# Assemble the per-paper feature table, one row per Arxiv paper.
#
# The frame is built from a column mapping instead of stacking reshaped
# NumPy arrays:
#   * `authors` is a list of variable-length lists; np.array() on such ragged
#     input is rejected by recent NumPy releases, and yields a 2-D array
#     (wrong row count after reshape) when all lists share a length.
#   * concatenating string and integer columns coerces every value to a
#     common dtype, so the numeric columns lose their integer dtype.
#   * the source lists (`title`, `authors`, ...) are no longer rebound to
#     (n, 1) arrays, so earlier cells that treat them as lists can be re-run.
df = pd.DataFrame(
    {
        'title': title,
        'authors': authors,
        'num_authors': num_authors,
        'title_length': title_length,
        'arxiv_order': arxiv_order,
        'scirate_rank': scirate_rank,
    },
    columns=columns,
    index=index,
)

# 2-D array form of the table (one row per paper), kept for cells that
# index into `data` directly.
data = df.values

In [75]:
# Preview the first rows only; displaying the full frame floods the output.
df.head(10)

Unnamed: 0,title,authors,num_authors,title_length,arxiv_order,scirate_rank
day_2018_03_01_0000,Blockchain platform with proof-of-work based o...,"[Kirill P. Kalinin, Natalia G. Berloff]",2,78,0,6
day_2018_03_01_0001,The Higgs Mechanism in Higher-Rank Symmetric $...,"[Daniel Bulmash, Maissam Barkeshli]",2,66,1,4
day_2018_03_01_0002,Qubit Parity Measurement by Parametric Driving...,"[Baptiste Royer, Shruti Puri, Alexandre Blais]",3,61,2,3
day_2018_03_01_0003,Hidden Variables and the Two Theorems of John ...,[N. David Mermin],1,50,3,1
day_2018_03_01_0004,A Short Path Quantum Algorithm for Exact Optim...,[M. B. Hastings],1,53,4,0
day_2018_03_01_0005,Hidden Variable Quantum Mechanics from Branchi...,[Don Weingarten],1,72,5,29
day_2018_03_01_0006,Time-dependent treatment of tunneling and Time...,[Shmuel Gurvitz],1,62,6,10
day_2018_03_01_0007,Time Reversal Invariance in Quantum Mechanics,[Reza Moulavi Ardakani],1,45,7,31
day_2018_03_01_0008,Measuring the similarity of input particle sta...,"[Su-Yong Lee, Jeongho Bang, Jaewan Kim]",3,72,8,28
day_2018_03_01_0009,Error Correction in Structured Optical Receivers,"[Alec M. Hammond, Ian W. Frank, Ryan M. Camacho]",3,48,9,27


In [73]:
# First row of the 2-D `data` array (one row per paper).
# NOTE(review): this cell's execution count (73) precedes that of the cell
# that builds `data` (74), and the recorded output lists several titles
# rather than a single row - the output looks stale; re-run to refresh it.
data[0]

array(['Blockchain platform with proof-of-work based on analog Hamiltonian  optimisers',
       'The Higgs Mechanism in Higher-Rank Symmetric $U(1)$ Gauge Theories',
       'Qubit Parity Measurement by Parametric Driving in Circuit QED',
       'Hidden Variables and the Two Theorems of John Bell',
       'A Short Path Quantum Algorithm for Exact Optimization',
       'Hidden Variable Quantum Mechanics from Branching from Quantum Complexity'],
      dtype=object)