In [117]:
%load_ext autoreload
%autoreload 2

import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from multiprocessing.pool import ThreadPool
from multiprocessing import Process, Manager, Pool
from shared_funcs import gather_stats, merge_add_dict
from collections import Counter
import sys
sys.path.append("../")
from utils.parse_arxiv import filter_df_with_pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [96]:
chunks = pd.read_json('../data/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=50000)

In [43]:
def parse_authors(author_str):
    authors = []
    # remove noises
    author_str = author_str.replace(" ", "")
    author_str = author_str.replace("\n", "")
    parse_by_comma = author_str.split(',')
    for a in parse_by_comma:
        # parse by 'and'
        authors.extend(a.split(' and '))
    return authors

def parse_categories(cat_str):
    return cat_str

def parse_year(date_str):
    return date_str.split('-')[0]

def count_name_frequencies(name_list2d):
    # Flatten the list of lists into a single list of names
    all_names = [name for sublist in name_list2d for name in sublist]
    # Use Counter to count the frequency of each name
    name_frequencies = Counter(all_names)
    return dict(name_frequencies)

def merge_add_dict(a, b):
    return {key: a.get(key, 0) + b.get(key, 0) for key in set(a) | set(b)}

def gather_stats(df):
    intermediate_submitter_stats = dict(df['submitter'].value_counts())
    intermediate_authors_stats = count_name_frequencies(list(map(parse_authors, df['authors'])))
    # intermediate_authors_stats = merge_add_dict(intermediate_submitter_stats, intermediate_authors_stats)
    intermediate_cat_stats = dict(df['categories'].value_counts())
    intermediate_jou_stats = dict(df['journal-ref'].value_counts())
    intermediate_year_stats = dict(df['update_date'].value_counts())
    return intermediate_authors_stats, intermediate_cat_stats, intermediate_jou_stats, intermediate_year_stats

In [44]:
# df['journal-ref']

In [97]:
# manager = Manager()

# author_dict = manager.dict()
# cat_dict = manager.dict()
# year_dict = manager.dict()
author_dict = {}
cat_dict = {}
journal_dict = {}
year_dict = {}

counter = 0
for chunk in chunks:
    df = chunk
    intermediate_authors_stats, intermediate_cat_stats, intermediate_jou_stats, intermediate_year_stats = gather_stats(df)
    author_dict = merge_add_dict(author_dict, intermediate_authors_stats)
    cat_dict = merge_add_dict(cat_dict, intermediate_cat_stats)
    journal_dict = merge_add_dict(journal_dict, intermediate_jou_stats)
    year_dict = merge_add_dict(year_dict, intermediate_year_stats)
    
    
    counter += len(df)
    print("processed:", counter)
    print("aut stats => ", len(author_dict))
    print("cat stats => ", len(cat_dict))
    print("journal stats => ", len(journal_dict))
    print("year stats => ", len(year_dict))
    # break

processed: 50000
aut stats =>  100892
cat stats =>  3869
journal stats =>  25312
year stats =>  2639
processed: 100000
aut stats =>  172824
cat stats =>  6070
journal stats =>  50140
year stats =>  3260
processed: 150000
aut stats =>  236791
cat stats =>  8414
journal stats =>  75579
year stats =>  3637
processed: 200000
aut stats =>  297452
cat stats =>  11079
journal stats =>  99626
year stats =>  3871
processed: 250000
aut stats =>  354072
cat stats =>  13849
journal stats =>  122164
year stats =>  4050
processed: 300000
aut stats =>  408798
cat stats =>  16471
journal stats =>  141753
year stats =>  4197
processed: 350000
aut stats =>  462450
cat stats =>  18807
journal stats =>  160487
year stats =>  4317
processed: 400000
aut stats =>  514831
cat stats =>  21159
journal stats =>  179132
year stats =>  4413
processed: 450000
aut stats =>  569994
cat stats =>  23098
journal stats =>  197556
year stats =>  4491
processed: 500000
aut stats =>  622476
cat stats =>  24888
journal stats

In [48]:
def sort_dict(dict_data, byval=True, reverse=True):
    if byval: idx = 1
    else: idx = 0
    return dict(sorted(dict_data.items(), key=lambda item: item[idx], reverse=reverse))

In [75]:
author_dict = sort_dict(author_dict)
cat_dict = sort_dict(cat_dict)
journal_dict = sort_dict(journal_dict)
year_dict = sort_dict(year_dict)

In [52]:
with open('author.pickle', 'wb') as handle:
    pickle.dump(author_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('cat.pickle', 'wb') as handle:
    pickle.dump(cat_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('journal.pickle', 'wb') as handle:
    pickle.dump(journal_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('date.pickle', 'wb') as handle:
    pickle.dump(year_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Filter data for all metadata frequencies > 1

In [150]:
chunks = pd.read_json('../data/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=50000)
filtered_df = None
for i, chunk in enumerate(chunks):
    df = chunk
    df['cat_freq'] = df['categories'].map(cat_dict)
    df['journal_freq'] = df['journal-ref'].map(journal_dict)
    df['date_freq'] = df['update_date'].map(year_dict)
    df = df.loc[df['cat_freq']>1]
    print(df.shape)
    df = df.loc[df['journal_freq']>1]
    print(df.shape)
    df = df.loc[df['date_freq']>1]
    print(df.shape, '\n')
    if i == 0:
        filtered_df = df
    else:
        filtered_df = pd.concat([filtered_df, df])
    

(49411, 17)
(430, 17)
(430, 17) 

(49374, 17)
(292, 17)
(292, 17) 

(49324, 17)
(172, 17)
(172, 17) 

(49101, 17)
(283, 17)
(283, 17) 

(48923, 17)
(134, 17)
(133, 17) 

(48863, 17)
(97, 17)
(97, 17) 

(48949, 17)
(126, 17)
(126, 17) 

(48897, 17)
(83, 17)
(83, 17) 

(49045, 17)
(119, 17)
(119, 17) 

(49099, 17)
(96, 17)
(96, 17) 

(49096, 17)
(68, 17)
(68, 17) 

(49039, 17)
(55, 17)
(55, 17) 

(49035, 17)
(63, 17)
(63, 17) 

(49065, 17)
(80, 17)
(80, 17) 

(49068, 17)
(122, 17)
(122, 17) 

(49074, 17)
(165, 17)
(165, 17) 

(49000, 17)
(201, 17)
(201, 17) 

(48978, 17)
(195, 17)
(195, 17) 

(48903, 17)
(316, 17)
(316, 17) 

(48965, 17)
(371, 17)
(371, 17) 

(49035, 17)
(406, 17)
(406, 17) 

(49046, 17)
(500, 17)
(499, 17) 

(48841, 17)
(576, 17)
(576, 17) 

(48627, 17)
(526, 17)
(526, 17) 

(48742, 17)
(580, 17)
(580, 17) 

(48627, 17)
(539, 17)
(539, 17) 

(48702, 17)
(554, 17)
(554, 17) 

(48764, 17)
(577, 17)
(577, 17) 

(48707, 17)
(563, 17)
(563, 17) 

(48728, 17)
(553, 17)
(553, 

In [151]:
filtered_data.to_pickle('filtered_data.pickle')

NameError: name 'filtered_data' is not defined

In [138]:
loaded_filtered_df = filter_df_with_pickle('../data/arxiv-metadata-oai-snapshot.json', 'filtered_data.pickle')
print(loaded_filtered_df.shape)
loaded_filtered_df.head()

             id                   submitter  \
0      704.0001              Pavel Nadolsky   
1      704.0002                Louis Theran   
2      704.0003                 Hongjun Pan   
3      704.0004                David Callan   
4      704.0005          Alberto Torchinsky   
...         ...                         ...   
49995  802.2698     Galina L. Klimchitskaya   
49996  802.2699  Farhad  Jafarpour Hamadani   
49997  802.2700             Alessia Mandini   
49998  802.2701                  Lifeng Lai   
49999  802.2702            Gian Paolo Vacca   

                                                 authors  \
0      C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...   
1                        Ileana Streinu and Louis Theran   
2                                            Hongjun Pan   
3                                           David Callan   
4               Wael Abu-Shammala and Alberto Torchinsky   
...                                                  ...   
49995  G. L. Kl

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [None]:
upto = 1000
counter = 0
for k, v in author_dict.items():
    # if len(k)>30: continue
    counter+=1
    print("{:20} : {}".format(k.replace("\n", ""), v))
    if counter==upto: break