In [42]:
%load_ext autoreload
%autoreload 2

import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from multiprocessing.pool import ThreadPool
from multiprocessing import Process, Manager, Pool
from shared_funcs import gather_stats, merge_add_dict
from collections import Counter

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
chunks = pd.read_json('../data/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=50000)

In [43]:
def parse_authors(author_str):
    authors = []
    # remove noises
    author_str = author_str.replace(" ", "")
    author_str = author_str.replace("\n", "")
    parse_by_comma = author_str.split(',')
    for a in parse_by_comma:
        # parse by 'and'
        authors.extend(a.split(' and '))
    return authors

def parse_categories(cat_str):
    return cat_str

def parse_year(date_str):
    return date_str.split('-')[0]

def count_name_frequencies(name_list2d):
    # Flatten the list of lists into a single list of names
    all_names = [name for sublist in name_list2d for name in sublist]
    # Use Counter to count the frequency of each name
    name_frequencies = Counter(all_names)
    return dict(name_frequencies)

def merge_add_dict(a, b):
    return {key: a.get(key, 0) + b.get(key, 0) for key in set(a) | set(b)}

def gather_stats(df):
    intermediate_submitter_stats = dict(df['submitter'].value_counts())
    intermediate_authors_stats = count_name_frequencies(list(map(parse_authors, df['authors'])))
    # intermediate_authors_stats = merge_add_dict(intermediate_submitter_stats, intermediate_authors_stats)
    intermediate_cat_stats = dict(df['categories'].value_counts())
    intermediate_jou_stats = dict(df['journal-ref'].value_counts())
    intermediate_year_stats = dict(df['update_date'].value_counts())
    return intermediate_authors_stats, intermediate_cat_stats, intermediate_jou_stats, intermediate_year_stats

In [44]:
# df['journal-ref']

In [46]:
# manager = Manager()

# author_dict = manager.dict()
# cat_dict = manager.dict()
# year_dict = manager.dict()
author_dict = {}
cat_dict = {}
journal_dict = {}
year_dict = {}

counter = 0
for chunk in chunks:
    df = chunk
    intermediate_authors_stats, intermediate_cat_stats, intermediate_jou_stats, intermediate_year_stats = gather_stats(df)
    author_dict = merge_add_dict(author_dict, intermediate_authors_stats)
    cat_dict = merge_add_dict(cat_dict, intermediate_cat_stats)
    journal_dict = merge_add_dict(journal_dict, intermediate_jou_stats)
    year_dict = merge_add_dict(year_dict, intermediate_year_stats)
    
    
    counter += len(df)
    print("processed:", counter)
    print("aut stats => ", len(author_dict))
    print("cat stats => ", len(cat_dict))
    print("journal stats => ", len(journal_dict))
    print("year stats => ", len(year_dict))
    # break

processed: 50000
aut stats =>  126443
cat stats =>  6057
journal stats =>  18651
year stats =>  1968
processed: 100000
aut stats =>  213505
cat stats =>  9397
journal stats =>  37467
year stats =>  2310
processed: 150000
aut stats =>  290762
cat stats =>  12206
journal stats =>  55162
year stats =>  2485
processed: 200000
aut stats =>  356678
cat stats =>  14714
journal stats =>  73115
year stats =>  2585
processed: 250000
aut stats =>  419290
cat stats =>  16948
journal stats =>  90658
year stats =>  2660
processed: 300000
aut stats =>  478824
cat stats =>  19063
journal stats =>  107918
year stats =>  2706
processed: 350000
aut stats =>  537850
cat stats =>  21063
journal stats =>  124995
year stats =>  2723
processed: 400000
aut stats =>  594367
cat stats =>  23034
journal stats =>  141595
year stats =>  2739
processed: 450000
aut stats =>  652348
cat stats =>  25025
journal stats =>  157981
year stats =>  2754
processed: 500000
aut stats =>  708272
cat stats =>  27174
journal stats

In [48]:
def sort_dict(dict_data, byval=True, reverse=True):
    if byval: idx = 1
    else: idx = 0
    return dict(sorted(dict_data.items(), key=lambda item: item[idx], reverse=reverse))

In [49]:
author_dict = sort_dict(author_dict)
cat_dict = sort_dict(cat_dict)
journal_dict = sort_dict(journal_dict)
year_dict = sort_dict(year_dict)

In [None]:
with open('author.pickle', 'wb') as handle:
    pickle.dump(author_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('cat.pickle', 'wb') as handle:
    pickle.dump(cat_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
upto = 1000
counter = 0
for k, v in author_dict.items():
    # if len(k)>30: continue
    counter+=1
    print("{:20} : {}".format(k.replace("\n", ""), v))
    if counter==upto: break