In [1]:
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from collections import Counter
from multiprocessing.pool import ThreadPool
from multiprocessing import Process, Manager, Pool

In [2]:
chunks = pd.read_json('../data/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=50000)

In [3]:
def parse_authors(author_str):
    authors = []
    # remove noises
    author_str = author_str.replace(" ", "")
    author_str = author_str.replace("\n", "")
    parse_by_comma = author_str.split(',')
    for a in parse_by_comma:
        # parse by 'and'
        authors.extend(a.split(' and '))
    return authors

def parse_categories(cat_str):
    return cat_str

def parse_year(date_str):
    return date_str.split('-')[0]

def count_name_frequencies(name_list2d):
    # Flatten the list of lists into a single list of names
    all_names = [name for sublist in name_list2d for name in sublist]
    # Use Counter to count the frequency of each name
    name_frequencies = Counter(all_names)
    return dict(name_frequencies)

def merge_add_dict(a, b):
    return {key: a.get(key, 0) + b.get(key, 0) for key in set(a) | set(b)}

def gather_stats(df):
    intermediate_submitter_stats = dict(df['submitter'].value_counts())
    intermediate_authors_stats = count_name_frequencies(list(map(parse_authors, df['authors'])))
    # intermediate_authors_stats = merge_add_dict(intermediate_submitter_stats, intermediate_authors_stats)
    intermediate_cat_stats = dict(df['categories'].value_counts())
    intermediate_jou_stats = dict(df['journal-ref'].value_counts())
    # intermediate_year_stats = dict(df['update_date'].value_counts())
    return intermediate_authors_stats, intermediate_cat_stats, intermediate_jou_stats

In [4]:
# df['journal-ref']

In [5]:
# manager = Manager()

# author_dict = manager.dict()
# cat_dict = manager.dict()
# year_dict = manager.dict()
author_dict = {}
cat_dict = {}
journal_dcit = {}

counter = 0
for chunk in chunks:
    df = chunk
    intermediate_authors_stats, intermediate_cat_stats, intermediate_jou_stats = gather_stats(df)
    author_dict = merge_add_dict(author_dict, intermediate_authors_stats)
    cat_dict = merge_add_dict(cat_dict, intermediate_cat_stats)
    journal_dcit = merge_add_dict(journal_dcit, intermediate_jou_stats)
    
    counter += len(df)
    print("processed:", counter)
    print("aut stats => ", len(author_dict))
    print("cat stats => ", len(cat_dict))
    print()
    # break

processed: 50000
aut stats =>  100892
cat stats =>  3869

processed: 100000
aut stats =>  172824
cat stats =>  6070

processed: 150000
aut stats =>  236791
cat stats =>  8414

processed: 200000
aut stats =>  297452
cat stats =>  11079

processed: 250000
aut stats =>  354072
cat stats =>  13849

processed: 300000
aut stats =>  408798
cat stats =>  16471

processed: 350000
aut stats =>  462450
cat stats =>  18807

processed: 400000
aut stats =>  514831
cat stats =>  21159

processed: 450000
aut stats =>  569994
cat stats =>  23098

processed: 500000
aut stats =>  622476
cat stats =>  24888

processed: 550000
aut stats =>  673833
cat stats =>  26567

processed: 600000
aut stats =>  728148
cat stats =>  28306

processed: 650000
aut stats =>  778090
cat stats =>  30000

processed: 700000
aut stats =>  828319
cat stats =>  31605

processed: 750000
aut stats =>  878433
cat stats =>  33179

processed: 800000
aut stats =>  929850
cat stats =>  34726

processed: 850000
aut stats =>  980074
cat s

In [7]:
def sort_dict(dict_data, byval=True, reverse=True):
    if byval: idx = 1
    else: idx = 0
    return dict(sorted(dict_data.items(), key=lambda item: item[idx], reverse=reverse))

In [8]:
author_dict = sort_dict(author_dict)
cat_dict = sort_dict(cat_dict)
journal_dcit = sort_dict(journal_dcit)

In [43]:
upto = 1000
counter = 0
for k, v in author_dict.items():
    # if len(k)>30: continue
    counter+=1
    print("{:20} : {}".format(k.replace("\n", ""), v))
    if counter==upto: break

2)                   : 7347
Italy                : 4453
USA                  : 4422
Germany)             : 4030
Germany              : 4022
3)                   : 3600
Italy)               : 3262
USA)                 : 3132
etal                 : 3087
Y.Zhang              : 2434
Spain                : 2390
France)              : 2324
Russia)              : 2297
France               : 2226
UK)                  : 2131
4)                   : 1971
Y.Wang               : 1703
India)               : 1664
UK                   : 1633
J.Wang               : 1564
Moscow               : 1555
Y.Li                 : 1469
2                    : 1435
Russia               : 1389
Z.Wang               : 1346
L.Zhang              : 1325
Y.Gao                : 1297
Spain)               : 1294
X.Liu                : 1266
TakashiTaniguchi     : 1245
YangLiu              : 1243
KenjiWatanabe        : 1230
CMSCollaboration     : 1209
Japan                : 1179
5)                   : 1155
ATLASCollaboration  