In [1]:
import feedparser
import pandas as pd
from bs4 import BeautifulSoup

In [17]:
class ArxivRSS:
    """Fetch an RSS feed of papers and tabulate its entries as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the RSS feed; passed unchanged to ``feedparser.parse``.
    max_authors : int or None, default 10
        Maximum number of author names kept per paper. Longer author lists
        are cut to this length and terminated with ``"..."``. ``None``
        disables truncation.

    Attributes
    ----------
    paper_df : pandas.DataFrame or None
        Result of the most recent ``fetch_paper_list`` call; ``None`` until
        that method has been called.
    """

    # Column order of the DataFrame built by fetch_paper_list. Fixing it here
    # guarantees the columns exist even when the feed has no entries.
    COLUMNS = ("id", "title", "abstract", "url", "authors")

    # Marker appended to an author list that has been truncated.
    AUTHOR_ELLIPSIS = "..."

    def __init__(self, url, max_authors=10):
        self.url = url
        self.max_authors = max_authors
        self.paper_df = None

    def fetch_paper_list(self):
        """Download the feed and return one DataFrame row per entry.

        The result is also stored on ``self.paper_df``.

        Returns
        -------
        pandas.DataFrame
            Columns ``id``, ``title``, ``abstract``, ``url`` and ``authors``
            (a list of names). Empty, but with those columns, when the feed
            has no entries.
        """
        feed = self._fetch_n_parse_rss()
        records = [
            self._extract_paper_information(rss_entry)
            for rss_entry in feed["entries"]
        ]
        self.paper_df = pd.DataFrame(records, columns=list(self.COLUMNS))
        return self.paper_df

    def _fetch_n_parse_rss(self):
        """Fetch ``self.url`` and return the feed parsed by ``feedparser``."""
        return feedparser.parse(self.url)

    def _parse_html_element(self, raw_string):
        """Return the text content of ``raw_string`` with HTML markup removed."""
        soup = BeautifulSoup(raw_string, "html.parser")
        return soup.get_text()

    def _split_authors(self, author_string):
        """Turn a comma-separated author string into a list of clean names.

        Blank fragments (empty input, doubled or trailing commas) are
        dropped. Names are otherwise kept verbatim, including any TeX-style
        escapes supplied by the feed. Lists longer than ``self.max_authors``
        are truncated and terminated with ``AUTHOR_ELLIPSIS``.
        """
        names = [part.strip() for part in author_string.split(",")]
        names = [name for name in names if name]

        limit = self.max_authors
        if limit is not None and len(names) > limit:
            names = names[:limit] + [self.AUTHOR_ELLIPSIS]
        return names

    def _extract_paper_information(self, rss_entry):
        """Build the record for one feed entry.

        Parameters
        ----------
        rss_entry : mapping
            A single item of the parsed feed's ``entries``. ``id``, ``title``
            and ``link`` are required; ``summary`` and ``author`` are
            optional and default to an empty abstract / empty author list.

        Returns
        -------
        dict
            Keys ``id``, ``title``, ``abstract``, ``url`` and ``authors``.

        Raises
        ------
        KeyError
            If ``id``, ``title`` or ``link`` is missing from the entry.
        """
        # `or ""` also covers a field that is present but None.
        summary_html = rss_entry.get("summary") or ""
        author_string = rss_entry.get("author") or ""

        return {
            "id": rss_entry["id"],
            "title": rss_entry["title"],
            "abstract": self._parse_html_element(summary_html),
            "url": rss_entry["link"],
            "authors": self._split_authors(author_string),
        }

In [18]:
# Feed address = arXiv RSS base URL + category code; edit the category to
# follow a different listing.
arxiv_rss_base = "http://arxiv.org/rss/"
arxiv_category = "cs.CY"
rss_url = arxiv_rss_base + arxiv_category

arss = ArxivRSS(rss_url)

In [7]:
# Grab the parsed feed directly (through the class's private fetch helper) so
# the raw entries can be inspected in the following cells.
feed = arss._fetch_n_parse_rss()

In [16]:
# Show the raw, unsplit author string of every entry in the parsed feed.
for entry in feed["entries"]:
    print(entry["author"])

Tim Tian Hua, James Baskerville, Henri Lemoine, Mia Hopman, Aryan Bhatt, Tyler Tracy
Mauricio Baker, Gabriel Kulp, Oliver Marks, Miles Brundage, Lennart Heim
Alejandro Cuevas, Manoel Horta Ribeiro, Nicolas Christin
Mahika Phutane, Aditya Vashistha
Bruno Scarone, Ricardo Baeza-Yates
Aparna Ananthasubramaniam, Elyse J. Thulin, Viktoryia Kalesnikava, Silas Falde, Jonathan Kertawidjaja, Lily Johns, Alejandro Rodr\'iguez-Putnam, Emma Spring, Kara Zivin, Briana Mezuk
Tanusree Sharma, Yihao Zhou, Visar Berisha
Octavian M. Machidon
Jun-Wei Zeng, Jerry Shen
Lisa Dargasz
Joydeep Chandra, Satyam Kumar Navneet
Katelyn Morrison, Arpit Mathur, Aidan Bradshaw, Tom Wartmann, Steven Lundi, Afrooz Zandifar, Weichang Dai, Kayhan Batmanghelich, Motahhare Eslami, Adam Perer
Bo Wen, Chen Wang, Qiwei Han, Raquel Norel, Julia Liu, Thaddeus Stappenbeck, Jeffrey L. Rogers
Gautam Kishore Shahi, Scot A. Hale
Shalaka Satheesh, Katrin Klug, Katharina Beckh, H\'ector Allende-Cid, Sebastian Houben, Teena Hassan
Han J

In [14]:
# Inspect the 'authors' field of the first feed entry.
# NOTE(review): the output recorded for this cell is a single URL string, which
# does not look like an 'authors' value — presumably stale output from an
# earlier version of the cell; re-run to refresh.
feed['entries'][0]['authors']

'https://arxiv.org/abs/2507.15886'

In [19]:
# Despite the name, `paper_list` is a pandas DataFrame (one row per feed entry),
# as returned by ArxivRSS.fetch_paper_list().
paper_list = arss.fetch_paper_list()

In [21]:
# Print the parsed author list of each paper. Iterating the column directly
# avoids iterrows(), which builds a Series per row and yields an index that was
# never used here. DataFrame.get falls back to an empty sequence when the
# 'authors' column is absent, so nothing is printed for a column-less frame.
for authors in paper_list.get("authors", []):
    print(authors)

['Tim Tian Hua', 'James Baskerville', 'Henri Lemoine', 'Mia Hopman', 'Aryan Bhatt', 'Tyler Tracy']
['Mauricio Baker', 'Gabriel Kulp', 'Oliver Marks', 'Miles Brundage', 'Lennart Heim']
['Alejandro Cuevas', 'Manoel Horta Ribeiro', 'Nicolas Christin']
['Mahika Phutane', 'Aditya Vashistha']
['Bruno Scarone', 'Ricardo Baeza-Yates']
['Aparna Ananthasubramaniam', 'Elyse J. Thulin', 'Viktoryia Kalesnikava', 'Silas Falde', 'Jonathan Kertawidjaja', 'Lily Johns', "Alejandro Rodr\\'iguez-Putnam", 'Emma Spring', 'Kara Zivin', 'Briana Mezuk']
['Tanusree Sharma', 'Yihao Zhou', 'Visar Berisha']
['Octavian M. Machidon']
['Jun-Wei Zeng', 'Jerry Shen']
['Lisa Dargasz']
['Joydeep Chandra', 'Satyam Kumar Navneet']
['Katelyn Morrison', 'Arpit Mathur', 'Aidan Bradshaw', 'Tom Wartmann', 'Steven Lundi', 'Afrooz Zandifar', 'Weichang Dai', 'Kayhan Batmanghelich', 'Motahhare Eslami', 'Adam Perer']
['Bo Wen', 'Chen Wang', 'Qiwei Han', 'Raquel Norel', 'Julia Liu', 'Thaddeus Stappenbeck', 'Jeffrey L. Rogers']
['Gaut