In [7]:
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from collections import Counter
from multiprocessing.pool import ThreadPool
from multiprocessing import Process, Manager, Pool

In [17]:
chunks = pd.read_json('../data/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=50000)

In [24]:
def parse_authors(author_str):
    authors = []
    # remove noises
    author_str = author_str.replace(" ", "")
    author_str = author_str.replace("\n", "")
    parse_by_comma = author_str.split(',')
    for a in parse_by_comma:
        # parse by 'and'
        authors.extend(a.split(' and '))
    return authors

def parse_categories(cat_str):
    return cat_str

def parse_year(date_str):
    return date_str.split('-')[0]

def count_name_frequencies(name_list2d):
    # Flatten the list of lists into a single list of names
    all_names = [name for sublist in name_list2d for name in sublist]
    # Use Counter to count the frequency of each name
    name_frequencies = Counter(all_names)
    return dict(name_frequencies)

def merge_add_dict(a, b):
    return {key: a.get(key, 0) + b.get(key, 0) for key in set(a) | set(b)}

def gather_stats(df):
    intermediate_submitter_stats = dict(df['submitter'].value_counts())
    intermediate_authors_stats = count_name_frequencies(list(map(parse_authors, df['authors'])))
    intermediate_authors_stats = merge_add_dict(intermediate_submitter_stats, intermediate_authors_stats)
    intermediate_cat_stats = dict(df['categories'].value_counts())
    # intermediate_year_stats = dict(df['update_date'].value_counts())
    return intermediate_authors_stats, intermediate_cat_stats

In [27]:
# manager = Manager()

# author_dict = manager.dict()
# cat_dict = manager.dict()
# year_dict = manager.dict()
author_dict = {}
cat_dict = {}

counter = 0
for chunk in chunks:
    df = chunk
    intermediate_authors_stats, intermediate_cat_stats = gather_stats(df)
    author_dict = merge_add_dict(author_dict, intermediate_authors_stats)
    cat_dict = merge_add_dict(cat_dict, intermediate_cat_stats)
    
    counter += len(df)
    print("processed:", counter)
    print("aut stats => ", len(author_dict))
    print("cat stats => ", len(cat_dict))
    print()
    # break

processed: 50000
aut stats =>  201427
cat stats =>  7418

processed: 100000
aut stats =>  345530
cat stats =>  11724

processed: 150000
aut stats =>  466422
cat stats =>  15157

processed: 200000
aut stats =>  572660
cat stats =>  18239

processed: 250000
aut stats =>  671907
cat stats =>  21121

processed: 300000
aut stats =>  763515
cat stats =>  23759

processed: 350000
aut stats =>  846648
cat stats =>  26248

processed: 400000
aut stats =>  926592
cat stats =>  28557

processed: 450000
aut stats =>  1003872
cat stats =>  30792

processed: 500000
aut stats =>  1078814
cat stats =>  32852

processed: 550000
aut stats =>  1150241
cat stats =>  34922

processed: 600000
aut stats =>  1227359
cat stats =>  36727

processed: 650000
aut stats =>  1321304
cat stats =>  36963

processed: 700000
aut stats =>  1394958
cat stats =>  37836

processed: 750000
aut stats =>  1452426
cat stats =>  38753

processed: 800000
aut stats =>  1503576
cat stats =>  39743

processed: 850000
aut stats =>  15

In [39]:
def sort_dict(dict_data, byval=True, reverse=True):
    if byval: idx = 1
    else: idx = 0
    return dict(sorted(dict_data.items(), key=lambda item: item[idx], reverse=reverse))

In [43]:
# sort_dict(author_dict)

# sort_dict(cat_dict)

In [45]:
# # Convert dictionary to JSON formatted string
# json_data = json.dumps(author_dict, indent=4)

# # Writing JSON data to a file
# with open('../data/author_stats.json', 'w') as file:
#     file.write(json_data)