In [1]:
import itertools
import json
import re

import numpy as np
import pandas as pd

In [2]:
# Load the article records from the JSON dump sitting next to the notebook.
# The encoding is pinned to UTF-8 (the encoding JSON is specified in) so that
# non-ASCII author names decode the same way on every platform instead of
# depending on the locale's default encoding.
with open('articles.json', 'r', encoding='utf-8') as f:
    articles = json.load(f)

In [3]:
# DOI pattern for the 10.1101 prefix. The dots are escaped so they only match
# a literal '.', and the suffix accepts both a plain numeric id (585034) and a
# dotted numeric id (e.g. 2020.01.01.123456), which a bare `[0-9]+` would cut
# off at the first dot.
DOI_PATTERN = re.compile(r'10\.1101/[0-9]+(?:\.[0-9]+)*')


def extract_doi(url):
    """Return the ``10.1101/...`` DOI embedded in ``url``.

    Parameters
    ----------
    url : str
        Article URL expected to contain a ``10.1101/<digits>`` DOI.

    Returns
    -------
    str
        The matched DOI substring.

    Raises
    ------
    ValueError
        If no DOI is found, naming the offending URL (rather than failing
        with an opaque ``AttributeError`` on ``None``).
    """
    match = DOI_PATTERN.search(url)
    if match is None:
        raise ValueError('No 10.1101 DOI found in URL: {!r}'.format(url))
    return match.group()


# One row per article record. The '/'-separated date string is split into its
# first (year) and second (month) components, both kept as strings; the DOI is
# pulled out of the URL; only the columns used downstream are retained.
df = (
    pd.DataFrame
    .from_records(articles)
    .assign(
        year=lambda frame: frame['date'].apply(lambda date: date.split('/')[0]),
        month=lambda frame: frame['date'].apply(lambda date: date.split('/')[1]),
        doi=lambda frame: frame['url'].apply(extract_doi),
    )
    .filter(items=['doi', 'authors', 'year', 'month', 'url'])
)

df.head(2)

Unnamed: 0,doi,authors,year,month,url
0,10.1101/585034,"[[Tuszynski, Jack Adam], [Satarić, Miljko], [N...",2019,3,https://doi.org/10.1101/585034
1,10.1101/586305,"[[Grey, Michael J], [Cloots, Eva], [Simpson, M...",2019,3,https://doi.org/10.1101/586305


In [4]:
# Author vocabulary: every distinct author across all papers, stored as a
# tuple (the per-paper entries are lists, which are not hashable) and sorted
# so that the integer ids assigned below are deterministic.
authors = sorted({
    tuple(author)
    for paper_authors in df['authors']
    for author in paper_authors
})

# Author tuple -> integer id, where the id is the position in the sorted list.
author_mapping = {author: idx for idx, author in enumerate(authors)}

# Per paper, the list of integer ids in the same order as its `authors` entry.
df['mapped_authors'] = df['authors'].apply(
    lambda paper_authors: [
        author_mapping.get(tuple(author)) for author in paper_authors
    ]
)

df.head(2)

Unnamed: 0,doi,authors,year,month,url,mapped_authors
0,10.1101/585034,"[[Tuszynski, Jack Adam], [Satarić, Miljko], [N...",2019,3,https://doi.org/10.1101/585034,"[188713, 163269, 132685, 167153]"
1,10.1101/586305,"[[Grey, Michael J], [Cloots, Eva], [Simpson, M...",2019,3,https://doi.org/10.1101/586305,"[66515, 34020, 171931, 102857, 167621, 40868, ..."


In [5]:
# Directed co-author edge list: for every paper, emit each ordered pair of its
# author ids together with the paper's year (cast to int). Both (a, b) and
# (b, a) are produced, in the order the authors appear on the paper.
coauthor_records = [
    (id_a, id_b, int(year))
    for author_ids, year in zip(df['mapped_authors'], df['year'])
    for id_a, id_b in itertools.permutations(author_ids, 2)
]

edges = (
    pd.DataFrame(coauthor_records, columns=['id_a', 'id_b', 'year'])
    .query('id_a != id_b')  # a paper may list the same author id twice
    .reset_index(drop=True)
)

# Persist as an xz-compressed, tab-separated file without the index column.
edges.to_csv('biorxiv_coauthor.tsv.xz', sep='\t', index=False, compression='xz')
edges.head()

Unnamed: 0,id_a,id_b,year
0,188713,163269,2019
1,188713,132685,2019
2,188713,167153,2019
3,163269,188713,2019
4,163269,132685,2019
