In [4]:
import requests  
from bs4 import BeautifulSoup  
import pandas as pd  
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import re
from datetime import datetime

def fetch_arxiv_data(search_query, max_results, start_date, end_date, primary_category, categories):
    api_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": max_results,
    }
    if start_date:
        params["start_date"] = start_date
    if end_date:
        params["end_date"] = end_date
    if primary_category:
        params["cat"] = primary_category
    if categories:
        params["categories"] = categories

    response = requests.get(api_url, params=params)
    if response.status_code == 200:
        feed = BeautifulSoup(response.content, features="html.parser")
        entries = feed.find_all('entry')
        articles = []

        for entry in entries:
            article = {}
            article['Title'] = entry.title.text
            article['Authors'] = [author.find('name').text for author in entry.find_all('author')]
            article['Published'] = entry.published.text
            article['Updated'] = entry.updated.text
            article['Summary'] = entry.summary.text.strip()
            article['ID'] = entry.id.text
            articles.append(article)

        df = pd.DataFrame(articles)
        return df
    else:
        print("Failed to retrieve data from arXiv API")
        return None

def extract_categories(id):
    categories = re.findall(r'abs/([a-z\-]+)', id)
    return categories

def apply_real_names(categories):
    category_mapping = {
        'astro-ph': 'Astrophysics',
        'cond-mat': 'Condensed Matter',
        'cs': 'Computer Science',
        'econ': 'Economics',
        'eess': 'Electrical Engineering and Systems Science',
        'gr-qc': 'General Relativity and Quantum Cosmology',
        'hep-ex': 'High Energy Physics - Experiment',
        'hep-lat': 'High Energy Physics - Lattice',
        'hep-ph': 'High Energy Physics - Phenomenology',
        'hep-th': 'High Energy Physics - Theory',
        'math': 'Mathematics',
        'nlin': 'Nonlinear Sciences',
        'nucl-ex': 'Nuclear Experiment',
        'nucl-th': 'Nuclear Theory',
        'physics': 'Physics',
        'q-bio': 'Quantitative Biology',
        'q-fin': 'Quantitative Finance',
        'quant-ph': 'Quantum Physics',
    }
    return [category_mapping.get(category, 'Unknown') for category in categories]

def save_to_csv(df):
    today_date = datetime.now().strftime("%Y-%m-%d")
    df.to_csv(f'arxiv_pull_{today_date}.csv', index=False)
    print(f"Data saved to arxiv_pull_{today_date}.csv")

def fetch_arxiv_and_save(search_query, max_results=10, start_date=None, end_date=None, primary_category=None, categories=None):
    arxiv_data = fetch_arxiv_data(search_query, max_results, start_date, end_date, primary_category, categories)
    if arxiv_data is not None:
        arxiv_data['Categories'] = arxiv_data['ID'].apply(extract_categories)
        arxiv_data['Real_Categories'] = arxiv_data['Categories'].apply(apply_real_names)
        save_to_csv(arxiv_data)
        return arxiv_data
    else:
        return None

# Call the function with appropriate parameters
search_query = "cat:cs OR cat:physics OR cat:math OR cat:q-bio OR cat:q-fin OR cat:stat OR cat:econ OR cat:eess OR cat:astro-ph OR cat:cond-mat OR cat:gr-qc OR cat:hep-ex OR cat:hep-lat OR cat:hep-ph OR cat:hep-th OR cat:nucl-ex OR cat:nucl-th OR cat:quant-ph"
max_results = 2000
start_date = "2024-03-01"
end_date = "2024-03-03"

fetch_arxiv_and_save(search_query, max_results, start_date, end_date)

Data saved to arxiv_pull_2024-03-03.csv


Unnamed: 0,Title,Authors,Published,Updated,Summary,ID,Categories,Real_Categories
0,Supersymmetry Structure and Phenomena,[Nir Polonsky],2001-08-29T01:48:12Z,2001-08-29T01:48:12Z,A fairly non-technical introduction to and sur...,http://arxiv.org/abs/hep-ph/0108236v1,[hep-ph],[High Energy Physics - Phenomenology]
1,q-Boson approach to multiparticle correlations,"[D. V. Anchishkin, A. M. Gavrilik, N. Z. Iorgov]",2000-10-03T10:25:50Z,2000-10-03T10:25:50Z,An approach is proposed enabling to effectivel...,http://arxiv.org/abs/hep-ph/0010019v1,[hep-ph],[High Energy Physics - Phenomenology]
2,Gauge theory of things alive and universal dyn...,[G. Mack],1994-11-28T12:26:32Z,1994-11-28T12:26:32Z,Positing complex adaptive systems made of agen...,http://arxiv.org/abs/hep-lat/9411059v1,[hep-lat],[High Energy Physics - Lattice]
3,Solar neutrinos: global analysis and implicati...,"[John N. Bahcall, Plamen I. Krastev, Alexei Yu...",2001-03-16T00:36:02Z,2001-03-20T14:53:05Z,We present a global analysis of all the availa...,http://arxiv.org/abs/hep-ph/0103179v3,[hep-ph],[High Energy Physics - Phenomenology]
4,Hints on the power corrections from current co...,"[S. Narison, V. I. Zakharov]",2001-10-10T17:32:12Z,2001-10-19T15:22:11Z,We consider an interpretation of the recent la...,http://arxiv.org/abs/hep-ph/0110141v2,[hep-ph],[High Energy Physics - Phenomenology]
...,...,...,...,...,...,...,...,...
1995,Radial Flow in Non-Extensive Thermodynamics an...,"[Trambak Bhattacharyya, Jean Cleymans, Arvind ...",2015-07-30T09:49:01Z,2016-02-03T05:54:09Z,We expand the Tsallis distribution in a Taylor...,http://arxiv.org/abs/1507.08434v2,[],[]
1996,Effects of TMD evolution and partonic flavor o...,"[Alessandro Bacchetta, Miguel G. Echevarria, P...",2015-08-03T12:49:24Z,2015-08-03T12:49:24Z,We calculate the transverse momentum dependenc...,http://arxiv.org/abs/1508.00402v1,[],[]
1997,A short introduction to heavy-ion physics,[Sourendu Gupta],2015-08-05T17:15:54Z,2015-08-23T07:07:07Z,Heavy-ion collisions provide the only laborato...,http://arxiv.org/abs/1508.01136v2,[],[]
1998,On $m_T$ dependence of femtoscopy scales for m...,"[Yu. M. Sinyukov, V. M. Shapoval, V. Yu. Naboka]",2015-08-07T20:40:30Z,2015-08-07T20:40:30Z,"The $m_T$-dependencies of the femto-scales, th...",http://arxiv.org/abs/1508.01812v1,[],[]
