In [1]:
from bs4 import BeautifulSoup

import time

import urllib
import urllib.request
from urllib.error import HTTPError

import io
import re
from math import floor
import random
import pandas as pd
import numpy as np

In [2]:
# Finding all the categories
#
# Ask the OAI-PMH endpoint for its list of sets and keep every <setSpec>
# value. The response and the output file are both opened in `with` blocks so
# they are closed even if reading, parsing, or writing fails.

url = "http://export.arxiv.org/oai2?verb=ListSets"
with urllib.request.urlopen(url, data=None) as u:
    text = io.TextIOWrapper(u, encoding='utf-8').read()
soup = BeautifulSoup(text, 'xml')
all_cat = [sp.text for sp in soup.find_all("setSpec")]

# Persist the set names as one comma-separated line.
with open("all_cat_v01.txt", "w") as f:
    f.write(",".join(all_cat))

In [3]:
# Display the list of set identifiers collected by the ListSets request.
all_cat

['cs',
 'econ',
 'eess',
 'math',
 'physics',
 'physics:astro-ph',
 'physics:cond-mat',
 'physics:gr-qc',
 'physics:hep-ex',
 'physics:hep-lat',
 'physics:hep-ph',
 'physics:hep-th',
 'physics:math-ph',
 'physics:nlin',
 'physics:nucl-ex',
 'physics:nucl-th',
 'physics:physics',
 'physics:quant-ph',
 'q-bio',
 'q-fin',
 'stat']

In [2]:
def scrape(cat):
    """Harvest every record of one OAI-PMH set into a DataFrame.

    Issues a ``ListRecords`` request for ``cat`` and keeps following the
    ``resumptionToken`` of each response until the server stops returning a
    non-empty one. Each requested URL is printed so progress can be followed.

    Parameters
    ----------
    cat : str
        Set name passed as the ``set=`` query argument (e.g. ``"cs"``).

    Returns
    -------
    pandas.DataFrame
        One row per ``<record>`` with the columns ``doi``, ``date``,
        ``title``, ``authors`` and ``category``. They hold the text of the
        first ``<identifier>``, ``<created>``, ``<title>`` and ``<setSpec>``
        element of the record (``NaN`` when the element is absent) and the
        ``;``-joined text of all ``<author>`` elements (``""`` when there are
        none).

    Raises
    ------
    urllib.error.HTTPError
        For any HTTP error other than 503. A 503 is retried after the delay
        given by the ``Retry-After`` header (30 seconds if missing or not an
        integer).
    """
    # Initialization
    columns = ["doi", "date", "title", "authors", "category"]
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = base_url + "set={}&metadataPrefix=arXiv".format(cat)

    # (column name, XML tag) for the fields that hold a single value.
    single_fields = (
        ("doi", "identifier"),
        ("date", "created"),
        ("title", "title"),
        ("category", "setSpec"),
    )

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; growing a DataFrame row by row copies it on every record.
    rows = []

    # Loop through all the result pages
    while True:
        # print url to keep track of progress
        print(url)
        try:
            # The context manager closes the connection after each page.
            with urllib.request.urlopen(url, data=None) as response:
                text = io.TextIOWrapper(response, encoding='utf-8').read()
        except HTTPError as e:
            if e.code != 503:
                raise
            # The server asked us to slow down: wait, then retry the same URL.
            try:
                to = int(e.headers.get("retry-after", 30))
            except (TypeError, ValueError):
                # Retry-After was not a plain number of seconds.
                to = 30
            print("Got 503. Retrying after {0:d} seconds.".format(to))
            time.sleep(to)
            continue

        soup = BeautifulSoup(text, 'xml')

        # collecting the data
        for record in soup.find_all("record"):
            row = {}
            for column, tag in single_fields:
                node = record.find(tag)
                row[column] = node.text if node is not None else np.nan
            row["authors"] = ";".join(
                author.get_text(" ") for author in record.find_all("author")
            )
            rows.append(row)

        # An absent or empty resumptionToken marks the last page. The text of
        # a found element is always a string, so emptiness (not None-ness) is
        # the condition that has to be checked.
        token = soup.find("resumptionToken")
        if token is None or not token.text.strip():
            break
        url = base_url + "resumptionToken=%s" % (token.text)

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Harvest the whole "physics" set inline (rather than through a function) so
# that `df` is still populated with everything fetched so far if this
# long-running cell is interrupted or fails part-way.
cat = "physics"

# Initialization
columns = ["doi", "date", "title", "authors", "category"]
base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
url = base_url + "set={}&metadataPrefix=arXiv".format(cat)

# (column name, XML tag) for the fields that hold a single value.
single_fields = (
    ("doi", "identifier"),
    ("date", "created"),
    ("title", "title"),
    ("category", "setSpec"),
)

# Rows are collected in a list and converted once; appending to a DataFrame
# per record copies the whole frame every time.
rows = []

try:
    # Loop through all the result pages
    while True:
        # print url to keep track of progress
        print(url)
        try:
            # The context manager closes the connection after each page.
            with urllib.request.urlopen(url, data=None) as u:
                text = io.TextIOWrapper(u, encoding='utf-8').read()
        except HTTPError as e:
            if e.code != 503:
                raise
            # The server asked us to slow down: wait, then retry the same URL.
            try:
                to = int(e.headers.get("retry-after", 30))
            except (TypeError, ValueError):
                # Retry-After was not a plain number of seconds.
                to = 30
            print("Got 503. Retrying after {0:d} seconds.".format(to))
            time.sleep(to)
            continue

        soup = BeautifulSoup(text, 'xml')

        # collecting the data
        for record in soup.find_all("record"):
            row = {}
            for column, tag in single_fields:
                node = record.find(tag)
                row[column] = node.text if node is not None else np.nan
            row["authors"] = ";".join(
                author.get_text(" ") for author in record.find_all("author")
            )
            rows.append(row)

        # An absent or empty resumptionToken marks the last page; the text of
        # a found element is always a string, never None.
        token = soup.find("resumptionToken")
        if token is None or not token.text.strip():
            break
        url = base_url + "resumptionToken=%s" % (token.text)
finally:
    # Build the frame even on interruption so partial results are kept.
    df = pd.DataFrame(rows, columns=columns)



http://export.arxiv.org/oai2?verb=ListRecords&set=physics&metadataPrefix=arXiv
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|1001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|2001
Got 503. Retrying after 10 seconds.
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|2001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|3001
Got 503. Retrying after 10 seconds.
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|3001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|4001
Got 503. Retrying after 10 seconds.
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|4001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|5001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|6001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|7001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2

http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|104001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|105001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|106001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|107001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|108001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|109001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|110001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|111001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|112001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|113001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|114001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|115001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|116001

http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|211001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|212001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|213001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|214001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|215001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|216001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|217001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|218001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|219001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|220001
Got 503. Retrying after 600 seconds.
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|220001
Got 503. Retrying after 600 seconds.
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|220001
Go

http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|310001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|311001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|312001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|313001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|314001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|315001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|316001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|317001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|318001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|319001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|320001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|321001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|322001

http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|417001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|418001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|419001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|420001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|421001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|422001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|423001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|424001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|425001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|426001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|427001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|428001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|429001

http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|524001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|525001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|526001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|527001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|528001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|529001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|530001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|531001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|532001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|533001
http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=2212497|534001


In [None]:
# Load previously saved records from data.csv.
# NOTE(review): presumably the output of an earlier harvest with the same columns as `df` — confirm.
df_old = pd.read_csv("data.csv")

In [None]:
# Stack the previously saved records on top of the freshly harvested ones.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Caution: this rebinds `df`, so running the cell twice adds df_old twice.
df = pd.concat([df_old, df], ignore_index=True)

In [None]:
# Save the combined records; the positional index is left out of the file.
df.to_csv("data_v4.csv", index=False)

In [None]:
# Harvest every set in all_cat and combine the results into one frame.
# The per-set frames are collected first and concatenated once: appending to
# master_df inside the loop copies it on every iteration, and
# DataFrame.append was removed in pandas 2.0.
frames = []
for i in all_cat:
    print("----------------",i,"-------------------")
    frames.append(scrape(i))

if frames:
    master_df = pd.concat(frames, ignore_index=True)
else:
    # No sets to harvest: keep the same empty, correctly-labelled frame.
    master_df = pd.DataFrame(columns=("doi", "date", "title", "authors", "category"))

---------------- cs -------------------
http://export.arxiv.org/oai2?verb=ListRecords&set=cs&metadataPrefix=arXiv


In [None]:
# Preview the first rows of the combined frame.
master_df.head()

In [None]:
# Save the combined frame to data_v3.csv.
# NOTE(review): unlike the data_v4.csv export, this call does not pass index=False,
# so the row index is written as an extra first column — confirm that is intended.
master_df.to_csv("data_v3.csv")