"alg-geom" : Algebraic Geometry (math.AG) 

"dg-ga" : Differential Geometry (math.DG)

"q-alg" : Quantum Algebra (math.QA)

"patt-sol" : Pattern Formation and Solitons (nlin.PS)

"adap-org" : Adaptation and Self-Organizing Systems (nlin.AO)

"solv-int" : Exactly Solvable and Integrable Systems (nlin.SI) 

"chao-dyn" : Chaotic Dynamics (nlin.CD) 

"comp-gas" : Cellular Automata and Lattice Gases (nlin.CG) 

"chem-ph" : Chemical Physics (physics.chem-ph) 

"mtrl-th" : Materials Science (cond-mat.mtrl-sci)

"cmp-lg" : Computation and Language (cs.CL)

"atom-ph" : Atomic Physics (physics.atom-ph) 

"funct-an" : Functional Analysis (math.FA)

"acc-phys" : Accelerator Physics (physics.acc-ph) 

In [28]:
import io
import random
import re
import time
import urllib
import urllib.request
from math import floor

import pandas as pd
from bs4 import BeautifulSoup

In [20]:
def flatten(l):
    """Concatenate the sub-iterables of ``l`` into one list (one level deep)."""
    flat = []
    for group in l:
        # extend() walks each sub-iterable item by item, preserving order.
        flat.extend(group)
    return flat

def _general_category(term, legacy_map):
    """Reduce a category term such as ``"math.AG"`` to its general part.

    Everything before the last dot is kept; a term without a dot is used
    as-is. The result is then translated through ``legacy_map`` when it is
    one of the old-style archive names.
    """
    match = re.search(r"^(.+)\.", term)
    general = match.group(1) if match else term
    return legacy_map.get(general, general)


def query_cat2(cat, max_res=500):
    """Query the arXiv export API for papers in category ``cat``.

    Parameters
    ----------
    cat : str
        Category used in the ``cat:`` search query (e.g. ``"comp-gas"``).
    max_res : int, optional
        Value sent as ``max_results`` in the request (default 500).

    Returns
    -------
    list of list
        One ``[paper_id, title, authors, category]`` record per ``<entry>``
        in the response, where ``authors`` is the author names joined with
        ``";"`` and ``category`` is the general part of the entry's primary
        category, mapped to the current archive name for legacy codes.

    Raises
    ------
    urllib.error.URLError
        If the HTTP request fails.
    """
    # The XML still uses some of the old category codes; this dict maps them
    # to the current general category.
    correct_dict = {
        "alg-geom": "math", "dg-ga": "math", "q-alg": "math",
        "patt-sol": "nlin", "adap-org": "nlin", "solv-int": "nlin",
        "chao-dyn": "nlin", "comp-gas": "nlin", "chem-ph": "physics",
        "mtrl-th": "cond-mat", "cmp-lg": "cs", "atom-ph": "physics",
        "funct-an": "math", "acc-phys": "physics", "bayes-an": "physics",
    }

    url = "http://export.arxiv.org/api/query?search_query=cat:{}&max_results={}&sortBy=relevance&sortOrder=ascending".format(cat,max_res)

    # Fetch the response; the context managers make sure the HTTP connection
    # is closed even if reading or decoding fails.
    with urllib.request.urlopen(url, data=None) as u, \
            io.TextIOWrapper(u, encoding='utf-8') as f:
        text = f.read()

    # Parse the returned feed as XML
    soup = BeautifulSoup(text, 'xml')

    paper_lst = []
    for entry in soup.findAll("entry"):
        # One <entry> per paper
        title = entry.title.contents[0]
        paper_id = entry.id.contents[0]

        # All author names of the paper, joined into a single string
        au = ";".join(
            author.find("name").contents[0]
            for author in entry.findAll("author")
        )

        # General (top-level) part of the paper's primary category
        prim_cat = _general_category(entry.primary_category["term"],
                                     correct_dict)

        paper_lst.append([paper_id, title, au, prim_cat])
    return paper_lst

In [21]:
# Quick check: request five papers from the legacy "comp-gas" category.
query_cat2("comp-gas", max_res=5)

[['http://arxiv.org/abs/comp-gas/9302001v1',
  'Some comments on the correlation dimension of $1/f^α$ noise',
  'James Theiler',
  'nlin'],
 ['http://arxiv.org/abs/comp-gas/9302002v1',
  "Statistical error in a chord estimator of correlation dimension: the\n  ``rule of five''",
  'James Theiler;Turab Lookman',
  'nlin'],
 ['http://arxiv.org/abs/comp-gas/9302003v2',
  'Detecting Nonlinearity in Data with Long Coherence Times',
  'James Theiler;Paul S. Linsay;David M. Rubin',
  'nlin'],
 ['http://arxiv.org/abs/comp-gas/9303001v2',
  'A Lattice Boltzmann Model for Multi-phase Fluid Flows',
  'Daryl Grunau;Shiyi Chen;Kenneth Egger',
  'nlin'],
 ['http://arxiv.org/abs/comp-gas/9303002v1',
  'Message-Passing Multi-Cell Molecular Dynamics on the Connection Machine\n  5',
  'D. M. Beazley;P. S. Lomdahl',
  'nlin']]

In [23]:
# Getting all the categories

# The URL of the page whose last table lists the subject classifications
url = "https://arxiv.org/help/api/user-manual#subject_classifications"

# Fetch the url's HTML code; the context managers close the HTTP response
# even if reading or decoding fails.
with urllib.request.urlopen(url, data=None) as u, \
        io.TextIOWrapper(u, encoding='utf-8') as f:
    text = f.read()

# Using beautiful soup to read
soup = BeautifulSoup(text, 'html.parser')

# Using beautiful soup to find all the tables
table_lst = soup.findAll('table')

cat_lst_gen = [] # General category list
cat_lst_spe = [] # Specific category list
cat_dict = {}    # general category -> list of its specific categories
for (index, specific) in enumerate(table_lst[-1].findAll("td")):
    # Only the cells at even positions >= 2 of the last table are used;
    # cell 0 and every odd-positioned cell are skipped.
    if index == 0 or index % 2:
        continue

    # only get the abbreviation
    specific = specific.contents[0].strip()

    # The general category is everything before the last dot; an
    # abbreviation without a dot is its own general category.
    match = re.search(r"^(.+)\.", specific)
    general = match.group(1) if match else specific

    members = cat_dict.setdefault(general, [])
    if specific not in members:
        members.append(specific)

# Manually add (or overwrite) categories
cat_dict.update({"econ" : ["econ.EM"]})
cat_dict.update({"eess" : ["eess.AS", "eess.IV", "eess.SP"]})
cat_dict.update({"q-fin" : ["q-fin.CP", "q-fin.EC", "q-fin.GN", "q-fin.MF", "q-fin.PM",
                            "q-fin.PR", "q-fin.RM", "q-fin.ST", "q-fin.TR"]})

# Flat list of every specific category, used to drive the API queries
all_cat = [item for subcategory in cat_dict.values() for item in subcategory]

In [None]:
master_list = []
# Fetch the paper records for every category, one API request per category.
for (i, cat) in enumerate(all_cat):
    if i:
        # The arXiv API manual asks clients that call the API repeatedly to
        # wait about 3 seconds between consecutive requests.
        time.sleep(3)
    master_list.append(query_cat2(cat, max_res=1000))

# Merge the per-category result lists into one list of paper records
master_list_flat = flatten(master_list)

In [None]:
# One row per fetched paper record, with the four fields named explicitly.
df = pd.DataFrame(
    master_list_flat,
    columns=["paper ID", "Title", "Authors", "Category"],
)

In [None]:
# Save the table to data.csv as UTF-8, without writing the row index.
df.to_csv("data.csv", encoding="utf-8", index=False)