"alg-geom" : Algebraic Geometry (math.AG) 

"dg-ga" : Differential Geometry (math.DG)

"q-alg" : Quantum Algebra (math.QA)

"patt-sol" : Pattern Formation and Solitons (nlin.PS)

"adap-org" : Adaptation and Self-Organizing Systems (nlin.AO)

"solv-int" : Exactly Solvable and Integrable Systems (nlin.SI) 

"chao-dyn" : Chaotic Dynamics (nlin.CD) 

"comp-gas" : Cellular Automata and Lattice Gases (nlin.CG) 

"chem-ph" : Chemical Physics (physics.chem-ph) 

"mtrl-th" : Materials Science (cond-mat.mtrl-sci)

"cmp-lg" : Computation and Language (cs.CL)

"atom-ph" : Atomic Physics (physics.atom-ph) 

"funct-an" : Functional Analysis (math.FA)

"acc-phys" : Accelerator Physics (physics.acc-ph) 

In [6]:
from bs4 import BeautifulSoup

import time

import urllib
import urllib.request
from urllib.error import HTTPError

import io
import re
from math import floor
import random
import pandas as pd
import numpy as np

In [2]:
# Finding all the categories
#
# Ask the arXiv OAI-PMH endpoint for its list of sets and keep every
# <setSpec> value; these are the identifiers later passed to scrape().

url = "http://export.arxiv.org/oai2?verb=ListSets"

# Context managers guarantee the HTTP response is closed even if reading
# or decoding fails part-way through.
with urllib.request.urlopen(url, data=None) as u, io.TextIOWrapper(u, encoding='utf-8') as f:
    text = f.read()

soup = BeautifulSoup(text, 'xml')
# find_all is the current spelling; findAll is a deprecated alias.
all_cat = [sp.text for sp in soup.find_all("setSpec")]

# Persist the set specs as one comma-separated line for later reuse.
with open("all_cat_v01.txt", "w") as out:
    out.write(",".join(all_cat))

In [3]:
# Show the set specs collected from the ListSets response in the previous cell.
all_cat

['cs',
 'econ',
 'eess',
 'math',
 'physics',
 'physics:astro-ph',
 'physics:cond-mat',
 'physics:gr-qc',
 'physics:hep-ex',
 'physics:hep-lat',
 'physics:hep-ph',
 'physics:hep-th',
 'physics:math-ph',
 'physics:nlin',
 'physics:nucl-ex',
 'physics:nucl-th',
 'physics:physics',
 'physics:quant-ph',
 'q-bio',
 'q-fin',
 'stat']

In [16]:
def _field_text(record, tag):
    """Return the text of the first ``tag`` element under ``record``, or NaN if absent."""
    node = record.find(tag)
    return node.text if node is not None else np.nan


def scrape(cat):
    """Harvest all records of one arXiv OAI-PMH set into a DataFrame.

    Issues ``ListRecords`` requests against export.arxiv.org for the set
    ``cat`` (metadata format ``arXiv``) and keeps following the
    ``resumptionToken`` returned with each page until none is left.

    Parameters
    ----------
    cat : str
        Set spec to harvest, e.g. ``"cs"`` or ``"physics:hep-th"``.

    Returns
    -------
    pandas.DataFrame
        One row per ``<record>`` with columns ``doi``, ``date``, ``title``,
        ``authors`` and ``category``. A missing element yields NaN; authors
        are joined with ``";"``. Note that ``doi`` holds the text of the
        record's first ``<identifier>`` element.

    Raises
    ------
    urllib.error.HTTPError
        For any HTTP error other than 503 (503 responses are retried).
    """
    # Initialization
    columns = ["doi", "date", "title", "authors", "category"]
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = base_url + "set={}&metadataPrefix=arXiv".format(cat)

    # Rows are accumulated in a plain list and turned into a DataFrame once
    # at the end; appending to a DataFrame per record copies the whole frame
    # each time, and DataFrame.append no longer exists in pandas >= 2.0.
    rows = []

    # Loop through all the result pages
    while True:
        # print url to keep track of progress
        print(url)

        # accessing the url
        try:
            # The context managers close the connection once the page is read.
            with urllib.request.urlopen(url, data=None) as u, \
                    io.TextIOWrapper(u, encoding='utf-8') as f:
                text = f.read()
        except HTTPError as e:
            # The server asks clients to back off with a 503 + Retry-After.
            if e.code == 503:
                try:
                    to = int(e.headers.get("retry-after", 30))
                except (TypeError, ValueError):
                    # Retry-After may be absent or an HTTP-date; use the default.
                    to = 30
                print("Got 503. Retrying after {0:d} seconds.".format(to))
                time.sleep(to)
                continue  # retry the same url
            raise

        soup = BeautifulSoup(text, 'xml')

        # collecting the data
        for record in soup.find_all("record"):
            rows.append({
                "doi": _field_text(record, "identifier"),
                "date": _field_text(record, "created"),
                "title": _field_text(record, "title"),
                "authors": ";".join(
                    author.get_text(" ") for author in record.find_all("author")
                ),
                "category": _field_text(record, "setSpec"),
            })

        # Seeing if there is still data: the last page carries either no
        # resumptionToken element or an empty one. ``.text`` is always a
        # string, so emptiness (not ``is None``) is the correct stop test.
        token = soup.find("resumptionToken")
        token_value = token.text.strip() if token is not None else ""
        if not token_value:
            break
        url = base_url + "resumptionToken=%s" % (token_value)

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Harvest every set listed in all_cat and combine the results into master_df.
# The per-set frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
# NOTE(review): all_cat contains both "physics" and "physics:*" entries; if the
# parent set also returns the sub-set records, master_df will hold duplicates
# — confirm and de-duplicate if needed.
frames = []
for i in all_cat:
    print("----------------",i,"-------------------")
    df = scrape(i)
    frames.append(df)

if frames:
    master_df = pd.concat(frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; keep the expected empty schema instead.
    master_df = pd.DataFrame(columns=("doi", "date", "title", "authors", "category"))

---------------- cs -------------------
http://export.arxiv.org/oai2?verb=ListRecords&set=cs&metadataPrefix=arXiv


In [None]:
# Preview the first rows of the combined frame as a quick sanity check.
master_df.head()

In [None]:
# Write the combined records to data_v3.csv (relative to the working directory).
# With to_csv's defaults the row index is written as the first column.
master_df.to_csv("data_v3.csv")