In [31]:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import io
import re
from math import floor
import random

def query_cat(cat):
    '''
    Query to find authors from a category
    '''
    url = "http://export.arxiv.org/api/query?search_query=cat:"+cat
    u = urllib.request.urlopen(url, data = None)
    f = io.TextIOWrapper(u,encoding='utf-8')
    text = f.read()

    # Using beautiful soup to read 
    soup = BeautifulSoup(text, 'xml')
    
    authors = [] # Initialize the list of authors
        
    for i in soup.findAll("author"):
        if not i in authors:
            authors.append(i.find("name").contents[0]) # Append to the list of authors       
    return authors

def query_au(author):
    '''
    Query to find the author's paper category
    '''
    
    url = "http://export.arxiv.org/api/query?search_query=au:"+author.replace(" ", "+")

    # Fetch the url's HTML code
    u = urllib.request.urlopen(url, data = None)
    f = io.TextIOWrapper(u,encoding='utf-8')
    text = f.read()

    # Using beautiful soup to read 
    soup = BeautifulSoup(text, 'xml')

    cat_lst = []

    for i in soup.findAll("entry"):
        # iterate through each paper

    #     # Author part
    #     au = [] # Initialize the list of authors
    #     for j in i.findAll("author"):
    #         au.append(j.find("name").contents[0]) # Append to the list of authors

        # Paper part
        # Get the primary category of each paper
        try:
            prim_cat = re.search("^(.+)\.", i.primary_category["term"]).group(1)
    #         print( i.primary_category["term"])
        except:
            prim_cat = i.primary_category["term"]

        cat_lst.append(prim_cat)

    return cat_lst

def list_sample(l, p = 0.1):
    l = list(l)
    return random.sample(l, int(len(l)*p))

def flatten(l):
    return [item for sublist in l for item in sublist]

In [40]:
query_cat("econ.EM")

['Iván Fernández-Val',
 'Martin Weidner',
 'Johan Vikström',
 'Geert Ridder',
 'Martin Weidner',
 'Yu-Wei Hsieh',
 'Xiaoxia Shi',
 'Matthew Shum',
 'Ismael Mourifie',
 'Marc Henry',
 'Romuald Meango',
 'Manjesh K. Hanawal',
 'Shashank Mishra',
 'Yezekael Hayel',
 'Victor Chernozhukov',
 'Alfred Galichon',
 'Marc Henry',
 'Brendan Pass',
 'Theodoros Chatzivasileiadis',
 'Ida Johnsson',
 'Hyungsik Roger Moon',
 'Laura Liu',
 'Hyungsik Roger Moon',
 'Frank Schorfheide',
 'Eleonora Granziera',
 'Hyungsik Roger Moon',
 'Frank Schorfheide']

In [36]:
url = "http://export.arxiv.org/api/query?search_query=au:Stefan+Johansson"

# Fetch the url's HTML code
u = urllib.request.urlopen(url, data = None)
f = io.TextIOWrapper(u,encoding='utf-8')
text = f.read()

# Using beautiful soup to read 
soup = BeautifulSoup(text, 'xml')
# soup

# Note on arXiv API

https://arxiv.org/help/api/user-manual#_query_interface

http://export.arxiv.org/api/{method_name}?{parameters}

method_name = query

| Parameter    | Type                    |
|--------------|-------------------------|
| search_query | string                  |
| id_list      |  comma-delimited string |
| start        |  int                    |
| max_results  | int                     |

The parameter are connected using '&'


http://export.arxiv.org/api/query?search_query=au:{author}


| Prefix | Explanation       |
|--------|-------------------|
| ti     | Title             |
| au     | Author            |
| abs    | Abstract          |
| co     | Comment           |
| jr     | Journal Reference |
| cat    | Subject Category  |
| rn     | Report Number     |
| all    | All of the above  |

In [4]:
# Getting all the categories

# The URL
url = "https://arxiv.org/help/api/user-manual#subject_classifications"

# Fetch the url's HTML code
u = urllib.request.urlopen(url, data = None)
f = io.TextIOWrapper(u,encoding='utf-8')
text = f.read()

# Using beautiful soup to read 
soup = BeautifulSoup(text, 'html.parser')

# Using beautiful soup to find all the table
table_lst = soup.findAll('table')

cat_lst_gen = [] # General category list
cat_lst_spe = [] # Specific category list
cat_dict = {}
for (index, specific) in enumerate(table_lst[-1].findAll("td")):
    if(index%2 or index ==0):
        pass
    else:
#         print(index)
        # only get the abbreviation
        specific = specific.contents[0].strip()
        
        try:
            general = re.search("^(.+)\.", specific).group(1)
        except:
            general = specific
        
        if general in cat_dict.keys():
            if specific not in cat_dict[general]:
                cat_dict[general].append(specific)
        else:
            cat_dict.update({general: [specific]})
            
all_cat = [item for subcategory in cat_dict.values() for item in subcategory]

In [37]:
cat_dict.keys()

dict_keys(['stat', 'q-bio', 'cs', 'nlin', 'math', 'astro-ph', 'cond-mat', 'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'nucl-ex', 'nucl-th', 'physics', 'quant-ph'])

In [30]:
cat_dict

{'astro-ph': ['astro-ph'],
 'cond-mat': ['cond-mat.dis-nn',
  'cond-mat.mes-hall',
  'cond-mat.mtrl-sci',
  'cond-mat.other',
  'cond-mat.soft',
  'cond-mat.stat-mech',
  'cond-mat.str-el',
  'cond-mat.supr-con'],
 'cs': ['cs.AR',
  'cs.AI',
  'cs.CL',
  'cs.CC',
  'cs.CE',
  'cs.CG',
  'cs.GT',
  'cs.CV',
  'cs.CY',
  'cs.CR',
  'cs.DS',
  'cs.DB',
  'cs.DL',
  'cs.DM',
  'cs.DC',
  'cs.GL',
  'cs.GR',
  'cs.HC',
  'cs.IR',
  'cs.IT',
  'cs.LG',
  'cs.LO',
  'cs.MS',
  'cs.MA',
  'cs.MM',
  'cs.NI',
  'cs.NE',
  'cs.NA',
  'cs.OS',
  'cs.OH',
  'cs.PF',
  'cs.PL',
  'cs.RO',
  'cs.SE',
  'cs.SD',
  'cs.SC'],
 'gr-qc': ['gr-qc'],
 'hep-ex': ['hep-ex'],
 'hep-lat': ['hep-lat'],
 'hep-ph': ['hep-ph'],
 'hep-th': ['hep-th'],
 'math': ['math.AG',
  'math.AT',
  'math.AP',
  'math.CT',
  'math.CA',
  'math.CO',
  'math.AC',
  'math.CV',
  'math.DG',
  'math.DS',
  'math.FA',
  'math.GM',
  'math.GN',
  'math.GT',
  'math.GR',
  'math.HO',
  'math.IT',
  'math.KT',
  'math.LO',
  'math.MP',


In [28]:
cat_file = open("cat.csv", 'w')

cat_file.write(",".join(cat_dict.keys()))

cat_file.close()

In [5]:
# au = []
# for i in all_cat[:10]:
#     au.append(query_cat(i))
    
# au_lst = [author for sublist in au for author in sublist]

In [23]:
data_file = open("data.csv", 'w')

In [18]:
author_list = []
# fetch the list of authors
for (i, cat) in enumerate(all_cat):
    
#     print(i+1, " ", cat)
    author_list.append(query_cat(cat))
    
author_list = set(flatten(author_list))

In [24]:
# fetch the list of papers
header = "Author,Fields\n" 
data_file.write(header)
for (i, author) in enumerate(list_sample(author_list, p = 0.1)):
    try:
        line = author + ',' + ";".join(query_au(author)) + "\n"
        data_file.write(line)
#         data_file.write(",")
#         data_file.write(",".join(query_au(author)))
#         data_file.write("\n")
    except:
        pass

data_file.close()