In [None]:
import xml.etree.ElementTree as ET
from pathlib import Path
from urllib.parse import quote, urlencode

import requests

You can find detailed documentation about retrieving arXiv data via their REST API here: https://info.arxiv.org/help/api/user-manual.html

#### URL for querying ArXiv papers

In [None]:
# arXiv API query endpoint; the encoded query string is appended to it after a '?'.
BASE_URL = "https://export.arxiv.org/api/query"

#### Compose a Search Query

In [None]:
# The two individual filter expressions.
category_filter = "cat:cs.AI"
date_filter = "submittedDate:[202501010000+TO+202510012359]"

# Both filters must hold, so they are combined with the AND operator.
search_query = " AND ".join((category_filter, date_filter))

#### Configure other parameters and transform them into a query

In [None]:
# Request parameters: the search expression plus `start` and `max_results`.
params = dict(
    search_query=search_query,
    start=0,
    max_results=20,
)

# Percent-encode with `quote` (so spaces become %20) while leaving
# ':', '+', '[', ']' and '*' unescaped, which keeps the '+TO+' date range
# and the field prefixes in the search expression verbatim.
query = urlencode(params, safe=":+[]*", quote_via=quote)

In [None]:
# Display the URL-encoded query string for inspection.
query

#### Construct a URL for a GET request

In [None]:
# Endpoint and encoded query string, separated by '?'.
get_url = "?".join((BASE_URL, query))

In [None]:
# Display the complete request URL for inspection.
get_url

#### Execute the GET request

In [None]:
# Fetch the feed. The timeout keeps the cell from hanging on an unresponsive
# server, and raise_for_status() surfaces HTTP errors here rather than letting
# an error body reach the XML parsing step.
response = requests.get(get_url, timeout=30)
response.raise_for_status()

In [None]:
# Print the raw response body as text.
print(response.text)

In [None]:
# Define the namespace (arXiv uses Atom namespace)
namespaces = {
    'atom': 'http://www.w3.org/2005/Atom',
    'opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
    'arxiv': 'http://arxiv.org/schemas/atom'
}


def _child_text(element, path, strip=False):
    """Return the text of the first `path` match under `element`.

    Args:
        element: Element to search from.
        path: Namespace-prefixed path understood by `Element.findtext`.
        strip: When True, remove leading and trailing whitespace.

    Returns:
        The matched element's text, or None when no element matches.
    """
    text = element.findtext(path, namespaces=namespaces)
    if strip and text is not None:
        text = text.strip()
    return text


def _pdf_href(entry):
    """Return the href of the entry's link titled 'pdf', or None if there is none."""
    link = entry.find("atom:link[@title='pdf']", namespaces)
    return link.get('href') if link is not None else None


# Parse the XML string
root = ET.fromstring(response.text)

# Extract total results
total_results = _child_text(root, 'opensearch:totalResults')
print(f"Total results: {total_results}")

# Iterate through entries
entries = root.findall('atom:entry', namespaces)

papers = []
for entry in entries:
    # Missing elements become None instead of raising AttributeError, so one
    # incomplete entry does not abort parsing of the whole feed.
    paper = {
        'id': _child_text(entry, 'atom:id'),
        'title': _child_text(entry, 'atom:title', strip=True),
        'summary': _child_text(entry, 'atom:summary', strip=True),
        'published': _child_text(entry, 'atom:published'),
        'updated': _child_text(entry, 'atom:updated'),
        'authors': [_child_text(author, 'atom:name')
                    for author in entry.findall('atom:author', namespaces)],
        'pdf_link': _pdf_href(entry)
    }
    papers.append(paper)


In [None]:
# Display the list of parsed paper records.
papers

#### Download PDF files of the papers

In [None]:
def download_pdf(url, filename, timeout=30):
    """Download the PDF at `url` and save it to `filename`.

    Args:
        url: Address of the PDF to fetch.
        filename: Destination path; missing parent directories are created.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. The outcome (saved or failed) is reported with a printed message.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure the same way as a bad status code so that a
        # single unreachable URL does not abort a batch of downloads.
        print(f"Failed to download. Error: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to download. Status code: {response.status_code}")
        return

    target = Path(filename)
    # open()/write would raise FileNotFoundError if the directory were missing.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(response.content)
    print(f"PDF saved as {filename}")

In [None]:
# Directory that receives the downloaded PDFs. It is created up front so that
# writing the first file does not fail when the directory does not exist yet.
PAPERS_DIR = "../../data/arxiv_papers"
Path(PAPERS_DIR).mkdir(parents=True, exist_ok=True)

for paper in papers:
    pdf_link = paper.get('pdf_link')
    if not pdf_link:
        # Nothing to fetch for this record.
        continue
    # The last URL segment is used as the file name stem.
    path = f"{PAPERS_DIR}/{pdf_link.split('/')[-1]}.pdf"
    download_pdf(pdf_link, path)