"alg-geom" : Algebraic Geometry (math.AG) 

"dg-ga" : Differential Geometry (math.DG)

"q-alg" : Quantum Algebra (math.QA)

"patt-sol" : Pattern Formation and Solitons (nlin.PS) 

"adap-org" : Adaptation and Self-Organizing Systems (nlin.AO)

"solv-int" : Exactly Solvable and Integrable Systems (nlin.SI) 

"chao-dyn" : Chaotic Dynamics (nlin.CD) 

"comp-gas" : Cellular Automata and Lattice Gases (nlin.CG) 

"chem-ph" : Chemical Physics (physics.chem-ph) 

"mtrl-th" : Materials Science (cond-mat.mtrl-sci)

"cmp-lg" : Computation and Language (cs.CL)

"atom-ph" : Atomic Physics (physics.atom-ph) 

"funct-an" : Functional Analysis (math.FA)

"acc-phys" : Accelerator Physics (physics.acc-ph) 

In [92]:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import io
import re
from math import floor
import random

def query_cat(cat):
    '''
    Query the arXiv API for the authors of papers in one category.

    cat: arXiv subject category; it is appended verbatim after the
         ``cat:`` search prefix of the query URL.

    Returns a list of author names as plain ``str`` objects, in order of
    first appearance, with each name listed only once.

    Errors raised while fetching the URL propagate to the caller.
    '''
    url = "http://export.arxiv.org/api/query?search_query=cat:"+cat

    # The context managers close the HTTP response even if reading fails
    with urllib.request.urlopen(url, data = None) as u, \
            io.TextIOWrapper(u, encoding='utf-8') as f:
        text = f.read()

    # Using beautiful soup to read
    soup = BeautifulSoup(text, 'xml')

    authors = [] # Names in order of first appearance
    seen = set() # Names already collected, for fast duplicate checks

    for tag in soup.findAll("author"):
        # Compare names with names: testing the <author> element itself
        # against the list of names would never detect a duplicate.
        # str() yields an independent plain string for the caller.
        name = str(tag.find("name").contents[0])
        if name not in seen:
            seen.add(name)
            authors.append(name)
    return authors

def query_au(author):
    '''
    Query the arXiv API for the general categories of an author's papers.

    author: author name; it is URL-encoded (spaces become '+') and placed
            after the ``au:`` search prefix of the query URL.

    Returns a list with one item per <entry> element of the response: the
    part of the entry's primary category term before its last '.', or the
    whole term when it has no such prefix. Old-style category codes are
    mapped to their current general category.

    Errors raised while fetching the URL propagate to the caller.
    '''
    # Imported locally so the function does not rely on an extra
    # top-of-file import.
    from urllib.parse import quote_plus

    # XML still uses some of the old codes; this dict helps to correct them
    correct_dict = {"alg-geom" : "math", "dg-ga" : "math", "q-alg" : "math", "patt-sol" : "nlin", \
               "adap-org" : "nlin", "solv-int" : "nlin", "chao-dyn" : "nlin", "comp-gas" : "nlin", \
               "chem-ph" : "physics", "mtrl-th" : "cond-mat", "cmp-lg" : "cs", "atom-ph" : "physics",\
               "funct-an" : "math", "acc-phys" : "physics"}

    # quote_plus turns spaces into '+' and percent-encodes everything that
    # is not URL-safe, so names with non-ASCII characters can be requested.
    url = "http://export.arxiv.org/api/query?search_query=au:" + quote_plus(author)

    # Fetch the response; the context managers close it even on failure
    with urllib.request.urlopen(url, data = None) as u, \
            io.TextIOWrapper(u, encoding='utf-8') as f:
        text = f.read()

    # Using beautiful soup to read
    soup = BeautifulSoup(text, 'xml')

    # Greedy match: captures everything before the LAST dot of the term
    general_pattern = re.compile(r"^(.+)\.")

    cat_lst = []

    for entry in soup.findAll("entry"):
        # iterate through each paper and take its primary category
        term = entry.primary_category["term"]
        match = general_pattern.search(term)
        # A term without a dotted prefix is used as-is
        prim_cat = match.group(1) if match else term
        # Translate old-style codes; anything else is kept unchanged
        cat_lst.append(correct_dict.get(prim_cat, prim_cat))

    return cat_lst

def list_sample(l, p = 0.1):
    '''
    Draw a random subset holding a fraction ``p`` of the items of ``l``.

    l: any iterable; it is copied into a list before sampling.
    p: fraction of the items to keep; the sample size is
       ``len * p`` truncated to an integer.

    Returns a new list of distinct positions of ``l`` chosen by
    ``random.sample``.
    '''
    pool = list(l)
    how_many = int(len(pool) * p)
    return random.sample(pool, how_many)

def flatten(l):
    '''
    Concatenate the items of every sub-iterable of ``l`` into one list.

    Only one level of nesting is removed; item order is preserved.
    '''
    merged = []
    for inner in l:
        merged.extend(inner)
    return merged

In [93]:
# url = "http://export.arxiv.org/api/query?search_query=au:Kenji+Kajiwara"

# # Fetch the url's HTML code
# u = urllib.request.urlopen(url, data = None)
# f = io.TextIOWrapper(u,encoding='utf-8')
# text = f.read()

# # Using beautiful soup to read 
# soup = BeautifulSoup(text, 'xml')
# # soup

# Note on arXiv API

https://arxiv.org/help/api/user-manual#_query_interface

http://export.arxiv.org/api/{method_name}?{parameters}

method_name = query

| Parameter    | Type                    |
|--------------|-------------------------|
| search_query | string                  |
| id_list      |  comma-delimited string |
| start        |  int                    |
| max_results  | int                     |

The parameters are joined with '&', e.g. `search_query=au:{author}&start=0&max_results=10`.


http://export.arxiv.org/api/query?search_query=au:{author}


| Prefix | Explanation       |
|--------|-------------------|
| ti     | Title             |
| au     | Author            |
| abs    | Abstract          |
| co     | Comment           |
| jr     | Journal Reference |
| cat    | Subject Category  |
| rn     | Report Number     |
| all    | All of the above  |

In [94]:
# # Getting all the categories

# # The URL
# url = "https://arxiv.org/help/api/user-manual#subject_classifications"

# # Fetch the url's HTML code
# u = urllib.request.urlopen(url, data = None)
# f = io.TextIOWrapper(u,encoding='utf-8')
# text = f.read()

# # Using beautiful soup to read 
# soup = BeautifulSoup(text, 'html.parser')

# # Using beautiful soup to find all the table
# table_lst = soup.findAll('table')

# cat_lst_gen = [] # General category list
# cat_lst_spe = [] # Specific category list
# cat_dict = {}
# for (index, specific) in enumerate(table_lst[-1].findAll("td")):
#     if(index%2 or index ==0):
#         pass
#     else:
# #         print(index)
#         # only get the abbreviation
#         specific = specific.contents[0].strip()
        
#         try:
#             general = re.search("^(.+)\.", specific).group(1)
#         except:
#             general = specific
        
#         if general in cat_dict.keys():
#             if specific not in cat_dict[general]:
#                 cat_dict[general].append(specific)
#         else:
#             cat_dict.update({general: [specific]})

# # Manually add a new category
# cat_dict.update({"econ" : ["econ.EM"]})
# cat_dict.update({"eess" : ["eess.AS", "eess.IV", "eess.SP"]})
# cat_dict.update({"q-fin" : ["q-fin.CP", "q-fin.EC", "q-fin.GN", "q-fin.MF", "q-fin.PM",\
#                                "q-fin.PR", "q-fin.RM", "q-fin.ST", "q-fin.TR"]})
            
# all_cat = [item for subcategory in cat_dict.values() for item in subcategory]

In [95]:
# cat_file = open("cat.csv", 'w')

# cat_file.write(",".join(cat_dict.keys()))

# cat_file.close()

In [96]:
# Output file for the author/fields table. UTF-8 is requested explicitly so
# that writing does not depend on the platform's default encoding (author
# names are presumably not limited to ASCII).
data_file = open("data.csv", 'w', encoding='utf-8')

In [97]:
# Gather the authors of every category into one set, so that an author
# returned for several categories is kept only once.
# NOTE: all_cat must already be defined (it is not assigned in the active
# code of this cell).
author_list = set()
for cat in all_cat:
    author_list.update(query_cat(cat))

In [98]:
# fetch the list of papers
# For a 10% random sample of the authors, look up the general categories of
# their papers and write one "Author,Fields" row per author, with the
# categories joined by ';'.
import csv  # imported here so this cell carries its own dependency

# The csv writer quotes a name that contains a comma or a quote, so such a
# row still parses as two columns; other rows are written unchanged.
writer = csv.writer(data_file, lineterminator="\n")
try:
    writer.writerow(["Author", "Fields"])
    for author in list_sample(author_list, p = 0.1):
        try:
            writer.writerow([author, ";".join(query_au(author))])
        except Exception as err:
            # Best effort: skip an author whose lookup or write fails, but
            # report it instead of hiding it. Catching Exception (not a bare
            # except) leaves Ctrl-C able to stop the loop.
            print("Skipping author %r: %s" % (author, err))
finally:
    # Always release the file, even when the loop is interrupted
    data_file.close()