"alg-geom" : Algebraic Geometry (math.AG) 

"dg-ga" : Differential Geometry (math.DG)

"q-alg" : Quantum Algebra (math.QA)

"patt-sol" : Pattern Formation and Solitons (nlin.PS); 

"adap-org" : Adaptation and Self-Organizing Systems (nlin.AO)

"solv-int" : Exactly Solvable and Integrable Systems (nlin.SI) 

"chao-dyn" : Chaotic Dynamics (nlin.CD) 

"comp-gas" : Cellular Automata and Lattice Gases (nlin.CG) 

"chem-ph" : Chemical Physics (physics.chem-ph) 

"mtrl-th" : Materials Science (cond-mat.mtrl-sci)

"cmp-lg" : Computation and Language (cs.CL)

"atom-ph" : Atomic Physics (physics.atom-ph) 

"funct-an" : Functional Analysis (math.FA)

"acc-phys" : Accelerator Physics (physics.acc-ph) 

In [42]:
from bs4 import BeautifulSoup

import time

import urllib
import urllib.request
from urllib.error import HTTPError

import io
import re
from math import floor
import random
import pandas as pd

In [46]:
# Finding all the categories: ask the arXiv OAI-PMH endpoint for its sets
# and keep the text of every <setSpec> element.

url = "http://export.arxiv.org/oai2?verb=ListSets"
# The context manager closes the HTTP response even if reading fails.
with urllib.request.urlopen(url, data=None) as u:
    text = io.TextIOWrapper(u, encoding='utf-8').read()
soup = BeautifulSoup(text, 'xml')
all_cat = [sp.text for sp in soup.find_all("setSpec")]

# Cache the set specs on disk as a single comma-separated line; `with`
# guarantees the file is flushed and closed even if the write raises.
with open("all_cat_v01.txt", "w") as f:
    f.write(",".join(all_cat))

In [68]:
# Inspect the list of set specs collected from the <setSpec> elements.
all_cat

['cs',
 'econ',
 'eess',
 'math',
 'physics',
 'physics:astro-ph',
 'physics:cond-mat',
 'physics:gr-qc',
 'physics:hep-ex',
 'physics:hep-lat',
 'physics:hep-ph',
 'physics:hep-th',
 'physics:math-ph',
 'physics:nlin',
 'physics:nucl-ex',
 'physics:nucl-th',
 'physics:physics',
 'physics:quant-ph',
 'q-bio',
 'q-fin',
 'stat']

In [86]:
def _tag_text(record, name):
    """Return the text of the first ``name`` element in ``record``, or None if absent."""
    tag = record.find(name)
    return tag.text if tag is not None else None


def scrape(cat):
    """Harvest the arXiv OAI-PMH records of one set into a DataFrame.

    Issues a ``ListRecords`` request for the set ``cat`` with the ``arXiv``
    metadata prefix, then keeps requesting the next page with the
    ``resumptionToken`` found in each response until a response carries no
    token or an empty one. Every requested URL is printed to track progress.

    Parameters
    ----------
    cat : str
        Set spec to harvest, e.g. ``"cs"`` or ``"physics:hep-th"``.

    Returns
    -------
    pandas.DataFrame
        One row per ``<record>`` with the columns ``doi`` (text of the
        record's ``identifier`` element), ``date`` (``created``), ``title``,
        ``authors`` (author texts joined with ``;``) and ``category`` (the
        first ``setSpec`` element only). A field whose element is missing
        from a record is stored as ``None``.

    Raises
    ------
    urllib.error.HTTPError
        For any HTTP error other than 503. A 503 is retried indefinitely
        after sleeping for the ``retry-after`` delay (30 seconds if the
        header is missing or not an integer).
    """
    # Initialization
    columns = ["doi", "date", "title", "authors", "category"]
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = base_url + "set={}&metadataPrefix=arXiv".format(cat)
    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end: DataFrame.append copied the whole frame per record and no
    # longer exists in pandas 2.x.
    rows = []

    # Loop through all the result pages
    while True:
        # print url to keep track of progress
        print(url)
        try:
            u = urllib.request.urlopen(url, data=None)
        except HTTPError as e:
            # The server asks us to wait before retrying the same URL
            if e.code == 503:
                try:
                    to = int(e.hdrs.get("retry-after", 30))
                except ValueError:
                    # retry-after may be something other than a plain integer
                    to = 30
                print("Got 503. Retrying after {0:d} seconds.".format(to))
                time.sleep(to)
                continue  # retry the same url
            raise

        # Read the page; closing the wrapper also closes the HTTP response
        with io.TextIOWrapper(u, encoding='utf-8') as f:
            text = f.read()
        soup = BeautifulSoup(text, 'xml')

        # Collect the data
        for record in soup.find_all("record"):
            rows.append({
                "doi": _tag_text(record, "identifier"),
                "date": _tag_text(record, "created"),
                "title": _tag_text(record, "title"),
                "authors": ";".join(
                    author.get_text(" ") for author in record.find_all("author")
                ),
                "category": _tag_text(record, "setSpec"),
            })

        # See whether there is still data. An element's .text is "" (never
        # None) when it is empty, and OAI-PMH marks the last page with an
        # empty resumptionToken, so test for emptiness instead of None.
        token = soup.find("resumptionToken")
        next_token = token.text.strip() if token is not None else ""
        if not next_token:
            break
        url = base_url + "resumptionToken=%s" % next_token

    return pd.DataFrame(rows, columns=columns)

In [98]:
# Scrape every set and combine the per-set frames with one concat at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration. If the loop is interrupted, the frames of the
# sets completed so far remain available in `frames`.
frames = []
for i in all_cat:
    print(i)
    df = scrape(i)
    frames.append(df)

if frames:
    master_df = pd.concat(frames, ignore_index=True)
else:
    # No sets to scrape: keep the same empty, correctly-labelled frame.
    master_df = pd.DataFrame(columns=("doi", "date", "title", "authors", "category"))

cs
http://export.arxiv.org/oai2?verb=ListRecords&set=cs&metadataPrefix=arXiv
Got 503. Retrying after 10 seconds.
http://export.arxiv.org/oai2?verb=ListRecords&set=cs&metadataPrefix=arXiv
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2188010|1001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2188010|2001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2188010|3001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2188010|4001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2188010|5001
Got 503. Retrying after 10 seconds.
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2188010|5001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2188010|6001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2188010|7001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2188010|8001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2188010|9001
http://export.arxiv.org/oai

KeyboardInterrupt: 

In [101]:
# NOTE(review): the frame displayed by this cell has an 'Unnamed: 0' column and
# 'econ' rows, so this `df` presumably comes from a CSV reload in a cell that
# is not part of the visible notebook — confirm before relying on it.
df

Unnamed: 0,doi,date,title,authors,category
0,oai:arXiv.org:0704.3649,2007-04-27,Quantile and Probability Curves Without Crossing,Chernozhukov Victor MIT;Fernandez-Val Ivan Bos...,econ
1,oai:arXiv.org:0904.2931,2009-04-19,L1-Penalized Quantile Regression in High-Dimen...,Belloni Alexandre;Chernozhukov Victor,econ
2,oai:arXiv.org:0904.3132,2009-04-20,Posterior Inference in Curved Exponential Fami...,Belloni Alexandre;Chernozhukov Victor,econ
3,oai:arXiv.org:1010.4345,2010-10-20,Sparse Models and Methods for Optimal Instrume...,Belloni Alexandre;Chen Daniel;Chernozhukov Vic...,econ
4,oai:arXiv.org:1012.1297,2010-12-06,LASSO Methods for Gaussian Instrumental Variab...,Belloni Alexandre;Chernozhukov Victor;Hansen C...,econ
5,oai:arXiv.org:1105.6154,2011-05-30,Conditional Quantile Processes based on Series...,Belloni Alexandre;Chernozhukov Victor;Chetveri...,econ
6,oai:arXiv.org:1106.5242,2011-06-26,High Dimensional Sparse Econometric Models: An...,Belloni Alexandre;Chernozhukov Victor,econ
7,oai:arXiv.org:1201.0220,2011-12-30,Inference for High-Dimensional Sparse Economet...,Belloni Alexandre;Chernozhukov Victor;Hansen C...,econ
8,oai:arXiv.org:1201.0224,2011-12-30,Inference on Treatment Effects After Selection...,Belloni Alexandre;Chernozhukov Victor;Hansen C...,econ
9,oai:arXiv.org:1212.0442,2012-12-03,Some New Asymptotic Theory for Least Squares S...,Belloni Alexandre;Chernozhukov Victor;Chetveri...,econ


In [None]:
# Persist the combined frame to CSV. With to_csv's default index=True the row
# index is written as an extra, unnamed first column (read_csv loads it back
# as 'Unnamed: 0' unless index_col=0 is passed).
master_df.to_csv("data_v3.csv")