## Retrieving arXiv Papers

In [28]:
!pip install feedparser



## Restricting the Search to Papers in the Computer Science (cs) Category

# Using OAI-PMH


In [47]:
import time
import requests
import xml.etree.ElementTree as ET
import pandas as pd

# Base OAI-PMH URL for arXiv
base_url = 'http://export.arxiv.org/oai2?verb=ListRecords'

# Search parameters for OAI-PMH (for Computer Science category and metadata format 'oai_dc').
# NOTE: OAI-PMH `from`/`until` select on each record's datestamp (stored below
# as 'Updated'), which is not necessarily the paper's submission date, so the
# harvest can contain older papers whose record changed inside the window.
metadata_format = 'oai_dc'
category = 'cs'  # OAI set name (Computer Science)
from_date = '2023-01-01'  # Start date (YYYY-MM-DD)
until_date = '2024-12-31'  # End date (YYYY-MM-DD)

# XML namespaces used in the ListRecords response
OAI_NS = '{http://www.openarchives.org/OAI/2.0/}'
OAI_DC_NS = '{http://www.openarchives.org/OAI/2.0/oai_dc/}'
DC_NS = '{http://purl.org/dc/elements/1.1/}'

REQUEST_TIMEOUT_S = 60     # give up on a hung connection instead of blocking the kernel
PAGE_DELAY_S = 3           # polite pause between consecutive pages
MAX_RETRIES = 5            # attempts per page when the server answers 503
DEFAULT_RETRY_WAIT_S = 10  # used when a 503 carries no usable Retry-After header


def _fetch_oai_page(params):
    """Request one ListRecords page and return the parsed XML root.

    Args:
        params: dict of query arguments appended to ``base_url``; requests
            URL-encodes them (relevant for resumption tokens).

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the response.

    Raises:
        requests.HTTPError: the server answered with an error status other than 503.
        RuntimeError: the server kept answering 503 for ``MAX_RETRIES`` attempts.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT_S)

        if response.status_code == 503:
            # Throttled: wait as long as the server asks, then try the same page again.
            try:
                wait_s = int(response.headers.get('Retry-After', DEFAULT_RETRY_WAIT_S))
            except ValueError:
                wait_s = DEFAULT_RETRY_WAIT_S
            print(f"Server returned 503; retrying in {wait_s} seconds "
                  f"(attempt {attempt}/{MAX_RETRIES})...")
            time.sleep(wait_s)
            continue

        response.raise_for_status()
        # Parse the raw bytes so the parser honours the encoding declared in the
        # XML itself instead of the charset requests guesses for `.text`.
        return ET.fromstring(response.content)

    raise RuntimeError(f'OAI-PMH endpoint still unavailable after {MAX_RETRIES} attempts')


def _dc_text(metadata, tag):
    """Return the text of the first Dublin Core ``tag`` element, or 'N/A' if absent or empty."""
    node = metadata.find(f'.//{DC_NS}{tag}')
    if node is None or node.text is None:
        return 'N/A'
    return node.text


# List to store the data
data = []

# Arguments of the initial request (no resumptionToken yet)
params = {
    'metadataPrefix': metadata_format,
    'from': from_date,
    'until': until_date,
    'set': category,
}

while True:
    root = _fetch_oai_page(params)

    # Protocol-level failures arrive as an <error> element in an otherwise valid
    # reply. 'noRecordsMatch' simply means an empty result; anything else is fatal.
    error = root.find(f'{OAI_NS}error')
    if error is not None and error.get('code') != 'noRecordsMatch':
        raise RuntimeError(f"OAI-PMH error {error.get('code')}: {error.text}")

    # Iterate over each record and extract relevant fields
    for record in root.findall(f'.//{OAI_NS}record'):
        metadata = record.find(f'.//{OAI_DC_NS}dc')
        if metadata is None:
            # Record without a Dublin Core payload: nothing to extract.
            continue

        data.append({
            'Title': _dc_text(metadata, 'title'),
            'Summary': _dc_text(metadata, 'description'),
            'Updated': record.find(f'.//{OAI_NS}datestamp').text,
            # Only the first dc:subject is kept. Stored under a dict key rather
            # than in a variable so the `category` setting above is not overwritten.
            'Category': _dc_text(metadata, 'subject'),
        })

    # Find resumptionToken for pagination (if available)
    resumption_token = root.find(f'.//{OAI_NS}resumptionToken')

    # A missing or empty token means the last page has been retrieved
    if resumption_token is None or not (resumption_token.text or '').strip():
        break

    # Follow-up requests carry only the verb (already in base_url) and the token
    params = {'resumptionToken': resumption_token.text.strip()}

    # Sleep for 3 seconds to avoid overwhelming the server
    print("Sleeping for 3 seconds before the next request...")
    time.sleep(PAGE_DELAY_S)

# Convert the list of entries into a pandas DataFrame; naming the columns
# explicitly keeps them present even when the harvest returned no records.
df = pd.DataFrame(data, columns=['Title', 'Summary', 'Updated', 'Category'])

# Convert the 'Updated' column to datetime format
df['Updated'] = pd.to_datetime(df['Updated'])

# Display the DataFrame
display(df)


Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...
Sleeping for 3 seconds before the next request...


Unnamed: 0,Title,Summary,Updated,Category
0,Third Order Newton's Method for Zernike Polyno...,The Zernike radial polynomials are a system ...,2024-04-23,Mathematics - Numerical Analysis
1,A recursive linear time modular decomposition ...,A module of a graph G is a set of vertices t...,2024-07-15,Computer Science - Discrete Mathematics
2,"Tur\'an Graphs, Stability Number, and Fibonacc...",The Fibonacci index of a graph is the number...,2024-03-11,Computer Science - Discrete Mathematics
3,Projective Expected Utility,Motivated by several classic decision-theore...,2024-01-18,Quantum Physics
4,A Universal In-Place Reconfiguration Algorithm...,In the modular robot reconfiguration problem...,2024-03-15,Computer Science - Computational Geometry
...,...,...,...,...
217847,On Approximately Symmetric Informationally Com...,We address the problem of constructing posit...,2023-11-27,Quantum Physics
217848,Limitations of Quantum Coset States for Graph ...,It has been known for some time that graph i...,2023-11-27,Quantum Physics
217849,Computing Local Invariants of Qubit Systems,We investigate means to describe the non-loc...,2023-11-27,Quantum Physics
217850,Fast Quantum Fourier Transforms for a Class of...,An algorithm is presented allowing the const...,2023-11-27,Quantum Physics


In [48]:
# Persist the complete harvest to disk as CSV, leaving out the row index
raw_csv_path = 'arxiv_cs_papers_2023_2024.csv'
df.to_csv(raw_csv_path, index=False)

In [53]:
# Show the harvested DataFrame again (rich display)
display(df)

Unnamed: 0,Title,Summary,Updated,Category
0,Third Order Newton's Method for Zernike Polyno...,The Zernike radial polynomials are a system ...,2024-04-23,Mathematics - Numerical Analysis
1,A recursive linear time modular decomposition ...,A module of a graph G is a set of vertices t...,2024-07-15,Computer Science - Discrete Mathematics
2,"Tur\'an Graphs, Stability Number, and Fibonacc...",The Fibonacci index of a graph is the number...,2024-03-11,Computer Science - Discrete Mathematics
3,Projective Expected Utility,Motivated by several classic decision-theore...,2024-01-18,Quantum Physics
4,A Universal In-Place Reconfiguration Algorithm...,In the modular robot reconfiguration problem...,2024-03-15,Computer Science - Computational Geometry
...,...,...,...,...
217847,On Approximately Symmetric Informationally Com...,We address the problem of constructing posit...,2023-11-27,Quantum Physics
217848,Limitations of Quantum Coset States for Graph ...,It has been known for some time that graph i...,2023-11-27,Quantum Physics
217849,Computing Local Invariants of Qubit Systems,We investigate means to describe the non-loc...,2023-11-27,Quantum Physics
217850,Fast Quantum Fourier Transforms for a Class of...,An algorithm is presented allowing the const...,2023-11-27,Quantum Physics


In [60]:
# Tally how many rows carry each distinct 'Category' label (most frequent first)
category_counts = df['Category'].value_counts()

# Emit the heading and the tally in one call; sep="\n" puts each on its own line
print("\nCounts of all unique entries in column 'Category':", category_counts, sep="\n")



Counts of all unique entries in column 'Category':
Category
Computer Science - Computer Vision and Pattern Recognition    39430
Computer Science - Machine Learning                           35464
Computer Science - Computation and Language                   22805
Computer Science - Robotics                                   10894
Mathematics - Numerical Analysis                               7271
                                                              ...  
Physics - Atomic and Molecular Clusters                           2
Condensed Matter - Other Condensed Matter                         2
Nuclear Experiment                                                2
Physics - Atomic Physics                                          1
Mathematics - K-Theory and Homology                               1
Name: count, Length: 148, dtype: int64


In [64]:
# Filter entries that start with "Computer Science".
# na=False makes rows with a missing Category (None/NaN) count as non-matching;
# without it the mask contains NaN and the boolean indexing below raises.
is_cs = df['Category'].str.startswith('Computer Science', na=False)
filtered_df = df[is_cs]

# Display the filtered DataFrame
print("\nFiltered DataFrame where 'Category' starts with 'Computer Science':")
display(filtered_df)


Filtered DataFrame where 'Category' starts with 'Computer Science':


Unnamed: 0,Title,Summary,Updated,Category
1,A recursive linear time modular decomposition ...,A module of a graph G is a set of vertices t...,2024-07-15,Computer Science - Discrete Mathematics
2,"Tur\'an Graphs, Stability Number, and Fibonacc...",The Fibonacci index of a graph is the number...,2024-03-11,Computer Science - Discrete Mathematics
4,A Universal In-Place Reconfiguration Algorithm...,In the modular robot reconfiguration problem...,2024-03-15,Computer Science - Computational Geometry
5,New probabilistic interest measures for associ...,Mining association rules is an important tec...,2024-01-01,Computer Science - Databases
6,On the `Semantics' of Differential Privacy: A ...,"Differential privacy is a definition of ""pri...",2023-01-24,Computer Science - Cryptography and Security
...,...,...,...,...
217815,Generalized Entropy Power Inequalities and Mon...,New families of Fisher information and entro...,2024-05-07,Computer Science - Information Theory
217816,Distributed Control of Microscopic Robots in B...,Current developments in molecular electronic...,2023-02-03,Computer Science - Robotics
217817,On the Complexity of the Numerically Definite ...,"In this paper, we determine the complexity o...",2024-04-19,Computer Science - Logic in Computer Science
217818,Menzerath-Altmann Law for Syntactic Structures...,"In the paper, the definition of clause suita...",2024-01-04,Computer Science - Computation and Language


In [65]:
# Save the filtered DataFrame to a CSV file. The name now carries the `.csv`
# extension, matching the unfiltered export written earlier in this notebook.
# NOTE(review): anything that reads the old extension-less file name
# ('arxiv_cs_papers_2023_2024_clean') must be pointed at the new name.
filtered_df.to_csv('arxiv_cs_papers_2023_2024_clean.csv', index=False)

In [26]:
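# Reference links:
# - arXiv category taxonomy: https://arxiv.org/category_taxonomy
# - arXiv API user manual (paging): https://info.arxiv.org/help/api/user-manual.html#paging
# - arXiv API Python paging example: https://info.arxiv.org/help/api/examples/python_arXiv_paging_example.txt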