In [40]:
%load_ext autoreload
%autoreload 2

import os
import re
from glob import glob
import json
import numpy as np
import pandas as pd
from difflib import SequenceMatcher

import matplotlib.pyplot as plt
import seaborn as sns

## Data Acquisition

In [6]:
# Collect the paths under each source directory; sorting gives both lists a
# deterministic (lexicographic) order.
arxiv_files, scirate_files = (
    sorted(glob(pattern))
    for pattern in ('../data/arxiv/*', '../data/scirate/*')
)

In [7]:
def _load_json_files(paths):
    """Parse every file in `paths` as JSON and return the decoded objects.

    The result keeps the order of `paths`. Files are opened as UTF-8
    explicitly so that non-ASCII text (e.g. accented author names) decodes
    the same way on every platform instead of depending on the locale's
    default encoding.
    """
    loaded = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            loaded.append(json.load(f))
    return loaded


arxiv_data = _load_json_files(arxiv_files)
print(len(arxiv_data))

scirate_data = _load_json_files(scirate_files)
print(len(scirate_data))

62
62


In [43]:
# Date tag of the last arXiv listing loaded (files were read in sorted path order).
arxiv_data[-1]['date']

'day_2018_03_30'

In [41]:
# 2018-03-30 arXiv top:
# first paper record of the last loaded arXiv listing.
arxiv_data[-1]['papers'][0]

{'abstract': 'Abstract: Out-of-time-order correlation (OTOC) functions provide a powerful theoreticaltool for diagnosing chaos and the scrambling of information instrongly-interacting, quantum systems. However, their direct and unambiguousexperimental measurement remains an essential challenge. At its core, thischallenge arises from the fact that the effects of both decoherence andexperimental noise can mimic that of information scrambling, leading to decayof OTOCs. Here, we analyze a quantum teleportation protocol that explicitlyenables one to differentiate between scrambling and decoherence. Moreover, wedemonstrate that within this protocol, one can extract a precise "noise"parameter which quantitatively captures the non-scrambling induced decay ofOTOCs. Using this parameter, we prove explicit bounds on the true value of theOTOC. Our results open the door to experimentally measuring quantum scramblingwith built-in verifiability.',
 'authors': ['Beni Yoshida', 'Norman Y. Yao'],
 'num_

In [42]:
# 2018-03-30 SciRate top:
# first paper record of the last loaded SciRate listing.
scirate_data[-1]['papers'][0]

{'authors': ['Beni Yoshida', 'Norman Y. Yao'],
 'rank': 0,
 'scite_count': 34,
 'title': 'Disentangling Scrambling and Decoherence via Quantum Teleportation'}

## EDA

Entry ID: paper title (or DOI?)
We can create an arbitrary paper ID that corresponds to each paper's title, authors, and DOI.

Possible features:

- Arxiv order
- Scirate order
- Paper length (pages)
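- Title length (words) — note: the feature-extraction cell below currently stores the character count, `len(paper['title'])`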
- Number of authors
- Total # of citations of the authors (or first author? last author?)
- Bag of Words of title
- Bag of Words of abstract

In [11]:
# Collect per-paper features from the arXiv listings into parallel lists
# (one entry per paper, all lists in the same order). The SciRate rank and
# score are attached to these papers in a later cell.

index = []
title = []
authors = []
num_authors = []
title_length = []
arxiv_order = []
submit_time = []
submit_weekday = []
paper_size = []
num_versions = []

for res in arxiv_data:
    date = res['date']
    papers = res['papers']
    for paper in papers:
        order = paper['order']

        # Arbitrary paper id: "<date>_<arXiv order>", with the order
        # zero-padded to at least four digits.
        index.append(date + '_' + str(order).zfill(4))

        title.append(paper['title'])
        authors.append(paper['authors'])
        num_authors.append(len(paper['authors']))
        # NOTE: this is the title length in characters, not in words.
        title_length.append(len(paper['title']))
        arxiv_order.append(order)
        submit_time.append(paper['submit_time'])
        submit_weekday.append(paper['submit_weekday'])
        # Keep the first run of digits found in the size string. The raw-string
        # pattern avoids the invalid-escape warning that '\d+' triggers.
        paper_size.append(int(re.findall(r'\d+', paper['size'])[0]))
        num_versions.append(paper['num_versions'])

In [13]:
# Total number of paper ids collected (one per arXiv paper across all listings).
len(index)

1727

In [14]:
# SciRate rank - match each SciRate paper to its position in the arXiv lists by title.


def _best_title_match(query, candidates):
    """Return the position in `candidates` of the string most similar to `query`.

    Similarity is `SequenceMatcher(a=query, b=candidate).ratio()`; on ties the
    earliest candidate wins (same choice as `np.argmax` over all ratios).
    `real_quick_ratio()` and `quick_ratio()` are documented upper bounds on
    `ratio()`, so a candidate whose bound does not exceed the best ratio seen
    so far is skipped without running the expensive comparison.

    Raises:
        ValueError: if `candidates` is empty.
    """
    best_pos, best_ratio = None, -1.0
    for pos, candidate in enumerate(candidates):
        matcher = SequenceMatcher(a=query, b=candidate)
        if matcher.real_quick_ratio() <= best_ratio or matcher.quick_ratio() <= best_ratio:
            continue
        ratio = matcher.ratio()
        if ratio > best_ratio:
            best_pos, best_ratio = pos, ratio
    if best_pos is None:
        raise ValueError('no candidate titles to match against')
    return best_pos


# -1 marks "no SciRate entry matched to this paper".
scirate_rank = [-1] * len(index)
scite_score = [-1] * len(index)

# Exact-match lookup table. setdefault keeps the first occurrence of a
# duplicated title, which is what list.index() would return.
first_position = {}
for pos, title_arx in enumerate(title):
    first_position.setdefault(title_arx, pos)

# NOTE(review): candidates span the titles of every listing, not only the
# listing of the same day as the SciRate entry, and the fuzzy fallback always
# accepts its best candidate however weak the match is. Confirm both are intended.
for res in scirate_data:
    papers = res['papers']
    for paper in papers:
        title_sci = paper['title']
        idx = first_position.get(title_sci)
        if idx is None:
            # No exact match: fall back to difflib similarity over all titles.
            idx = _best_title_match(title_sci, title)
        scirate_rank[idx] = paper['rank']
        scite_score[idx] = paper['scite_count']

In [15]:
# Column names of the feature table, in the order they appear in the DataFrame.
columns = [
    # taken from the arXiv listing
    'title',
    'authors',
    'num_authors',
    'title_length',
    'arxiv_order',
    'submit_time',
    'submit_weekday',
    'paper_size',
    'num_versions',
    # taken from the SciRate listing
    'scirate_rank',
    'scite_score',
]

In [17]:
# Assemble the feature table directly from the per-paper lists.
#
# Building it column by column (instead of reshaping every list to (n, 1) and
# concatenating) avoids three problems:
#   * np.array() on the ragged `authors` lists raises on recent NumPy versions;
#   * concatenating mixed arrays coerces every column to a single dtype, whereas
#     here numeric columns keep their integer dtype;
#   * the source lists are no longer overwritten, so this cell and the
#     title-matching cell can be re-run in any order.
column_values = {
    'title': title,
    'authors': authors,
    'num_authors': num_authors,
    'title_length': title_length,
    'arxiv_order': arxiv_order,
    'submit_time': submit_time,
    'submit_weekday': submit_weekday,
    'paper_size': paper_size,
    'num_versions': num_versions,
    'scirate_rank': scirate_rank,
    'scite_score': scite_score,
}

# `columns` fixes the column order; pandas raises if a list's length does not
# match the length of `index`.
df = pd.DataFrame(column_values, index=index, columns=columns)

# The same table as a 2-D object array, kept so that the name `data` is still
# defined for any code that reads it.
data = df.to_numpy()

In [28]:
# Row count of the feature table; should equal the number of paper ids in `index`.
len(df)

1727

In [18]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,title,authors,num_authors,title_length,arxiv_order,submit_time,submit_weekday,paper_size,num_versions,scirate_rank,scite_score
day_2018_01_01_0000,Harvesting Entanglement from the Black Hole Va...,"[Laura J. Henderson, Robie A. Hennigar, Robert...",5,50,0,19:00:00,Thu,118,1,7,3
day_2018_01_01_0001,Suppression of heating in quantum spin cluster...,"[Kai Ji, Boris V. Fine]",2,104,1,19:10:19,Thu,238,1,23,0
day_2018_01_01_0002,Simulating boson sampling in lossy architectures,"[Raúl García-Patrón, Jelmer J. Renema, Valery ...",3,48,2,19:52:42,Thu,1107,1,1,16
day_2018_01_01_0003,Local Casimir Effect for a Scalar Field in Pre...,"[Davide Fermi (Universita' di Milano), Livio P...",2,71,3,19:58:14,Thu,125,1,22,0
day_2018_01_01_0004,Emerging Connections: Quantum and Classical Op...,"[Xiao-Feng Qian, A. Nick Vamivakas, Joseph H. ...",3,50,4,20:01:22,Thu,6095,1,21,0


In [26]:
# Spearman rank correlation between arXiv listing order, scite count and
# SciRate rank, computed over all rows of the table at once. The columns are
# cast to float first so corr() receives numeric data however they are stored.
# NOTE(review): papers that never matched a SciRate entry keep the -1
# placeholder and would be included here - confirm whether any remain.
rank_columns = ['arxiv_order', 'scite_score', 'scirate_rank']
df.loc[:, rank_columns].astype(float).corr(method='spearman')

Unnamed: 0,arxiv_order,scite_score,scirate_rank
arxiv_order,1.0,0.008668,-0.042569
scite_score,0.008668,1.0,-0.810787
scirate_rank,-0.042569,-0.810787,1.0
