In [12]:
import time
import os
import random
import logging
# Configure logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_arxiv_dummy_data():
    try:
        delay = random.randint(1, 3)  # 随机延迟 1 到 10 秒
        time.sleep(delay)  # 模拟网络延迟
        # Get the path to the local file
        file_path = os.path.join(os.getcwd(), "arxiv_dummy_data.xml")
        # Check if file exists
        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            raise FileNotFoundError(f"Could not find arxiv_dummy_data.xml at {file_path}")
        # Read and validate file content
        with open(file_path, "r", encoding="utf-8") as file:
            response = file.read()
            
        if not response:
            logger.error("File is empty")
            raise ValueError("arxiv_dummy_data.xml is empty")
            
        logger.info(f"Successfully read {len(response)} bytes from file")
        return response
    
    except Exception as e:
        logger.error(f"Error simulating network delay: {e}")

In [13]:
dummy_data = get_arxiv_dummy_data()
if dummy_data:
    print("Dummy data retrieved successfully.")
    print(dummy_data[:100])  # Print the first 100 characters of the dummy data
else:
    print("Failed to retrieve dummy data.")

INFO:__main__:Successfully read 131386 bytes from file


Dummy data retrieved successfully.
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <link href="ht


In [33]:
import feedparser

feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

feed = feedparser.parse(dummy_data)
logger.info(f'Feed title: %s' % feed.feed.title)
# Create a dictionary to store feed metadata
result = {
    'feed_metadata': {
        'title': feed.feed.title,
        'updated': feed.feed.updated,
        'total_results': feed.feed.opensearch_totalresults,
        'items_per_page': feed.feed.opensearch_itemsperpage,
        'start_index': feed.feed.opensearch_startindex
    },
    'papers': []
}

# Process each entry
for entry in feed.entries:
    paper = {
        'arxiv_id': entry.id,
        'title': entry.title,
        'authors': [author.name for author in entry.authors],
        'affiliation': [author.get('arxiv:affiliation', '') for author in entry.authors]
    }
    result['papers'].append(paper)


AttributeError: module 'feedparser' has no attribute '_FeedParserMixin'

In [31]:
test_entry = feed.entries[1]
print(test_entry)
print (f"Title: {test_entry.title}")
print (f"Authors: {[author.name for author in test_entry.authors]}")
print(f"authors {test_entry.authors}")
print(test_entry.author_detail)
print(test_entry.arxiv_affiliation)


{'id': 'http://arxiv.org/abs/1805.02867v2', 'guidislink': True, 'link': 'http://arxiv.org/abs/1805.02867v2', 'updated': '2018-07-28T06:51:27Z', 'updated_parsed': time.struct_time(tm_year=2018, tm_mon=7, tm_mday=28, tm_hour=6, tm_min=51, tm_sec=27, tm_wday=5, tm_yday=209, tm_isdst=0), 'published': '2018-05-08T07:34:17Z', 'published_parsed': time.struct_time(tm_year=2018, tm_mon=5, tm_mday=8, tm_hour=7, tm_min=34, tm_sec=17, tm_wday=1, tm_yday=128, tm_isdst=0), 'title': 'Online normalizer calculation for softmax', 'title_detail': {'type': 'text/plain', 'language': None, 'base': '', 'value': 'Online normalizer calculation for softmax'}, 'summary': 'The Softmax function is ubiquitous in machine learning, multiple previous\nworks suggested faster alternatives for it. In this paper we propose a way to\ncompute classical Softmax with fewer memory accesses and hypothesize that this\nreduction in memory accesses should improve Softmax performance on actual\nhardware. The benchmarks confirm this

In [32]:
import xml.etree.ElementTree as ET

xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <link href="http://arxiv.org/api/query?search_query%3Dau%3ANVIDIA%26id_list%3D%26start%3D0%26max_results%3D100" rel="self" type="application/atom+xml"/>
    <title type="html">ArXiv Query: search_query=au:NVIDIA&amp;id_list=&amp;start=0&amp;max_results=100</title>
    <id>http://arxiv.org/api/FDHFT2fjT+rKbUJuyXDF3N1b4Zk</id>
    <updated>2025-03-28T00:00:00-04:00</updated>
    <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">32</opensearch:totalResults>
    <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
    <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">100</opensearch:itemsPerPage>
    <entry>
        <id>http://arxiv.org/abs/1504.01441v3</id>
        <updated>2015-05-05T00:15:00Z</updated>
        <published>2015-04-07T00:29:54Z</published>
        <title>Locally Non-rigid Registration for Mobile HDR Photography</title>
        <summary>  Image registration for stack-based HDR photography is challenging. If not
properly accounted for, camera motion and scene changes result in artifacts in
the composite image. Unfortunately, existing methods to address this problem
are either accurate, but too slow for mobile devices, or fast, but prone to
failing. We propose a method that fills this void: our approach is extremely
fast---under 700ms on a commercial tablet for a pair of 5MP images---and
prevents the artifacts that arise from insufficient registration quality.
</summary>
        <author>
            <name>Orazio Gallo</name>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NVIDIA</arxiv:affiliation>
        </author>
        <author>
            <name>Alejandro Troccoli</name>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NVIDIA</arxiv:affiliation>
        </author>
        <author>
            <name>Jun Hu</name>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NVIDIA</arxiv:affiliation>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">Duke University</arxiv:affiliation>
        </author>
        <author>
            <name>Kari Pulli</name>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NVIDIA</arxiv:affiliation>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">Light</arxiv:affiliation>
        </author>
        <author>
            <name>Jan Kautz</name>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NVIDIA</arxiv:affiliation>
        </author>
        <link href="http://arxiv.org/abs/1504.01441v3" rel="alternate" type="text/html"/>
        <link title="pdf" href="http://arxiv.org/pdf/1504.01441v3" rel="related" type="application/pdf"/>
        <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.CV" scheme="http://arxiv.org/schemas/atom"/>
        <category term="cs.CV" scheme="http://arxiv.org/schemas/atom"/>
    </entry>
</feed>
"""

# 定义命名空间映射：注意默认的 Atom 命名空间可以起一个别名，例如 "atom"
ns = {
    "atom": "http://www.w3.org/2005/Atom",
    "opensearch": "http://a9.com/-/spec/opensearch/1.1/",
    "arxiv": "http://arxiv.org/schemas/atom"
}

# 解析 XML
root = ET.fromstring(xml_data)

# 获取 feed 信息
feed_title = root.find("atom:title", ns).text
updated = root.find("atom:updated", ns).text
total_results = root.find("opensearch:totalResults", ns).text

print("Feed Title:", feed_title)
print("Updated:", updated)
print("Total Results:", total_results)

# 遍历所有 entry 元素
for entry in root.findall("atom:entry", ns):
    paper_id = entry.find("atom:id", ns).text
    title = entry.find("atom:title", ns).text
    published = entry.find("atom:published", ns).text
    summary = entry.find("atom:summary", ns).text

    # 提取所有 author 信息
    authors = []
    affiliations = []  # 收集所有 affiliation 数据，注意有的 author 可能有多个
    for author in entry.findall("atom:author", ns):
        name = author.find("atom:name", ns).text
        authors.append(name)
        # 注意 feedparser 可能会将 arxiv:affiliation 元素转换为 “arxiv_affiliation”
        # 但使用 ElementTree 时需使用命名空间查询
        for aff in author.findall("arxiv:affiliation", ns):
            affiliations.append(aff.text)

    print("Paper ID:", paper_id)
    print("Title:", title)
    print("Published:", published)
    print("Summary:", summary.strip())
    print("Authors:", authors)
    print("Affiliations:", affiliations)

Feed Title: ArXiv Query: search_query=au:NVIDIA&id_list=&start=0&max_results=100
Updated: 2025-03-28T00:00:00-04:00
Total Results: 32
Paper ID: http://arxiv.org/abs/1504.01441v3
Title: Locally Non-rigid Registration for Mobile HDR Photography
Published: 2015-04-07T00:29:54Z
Summary: Image registration for stack-based HDR photography is challenging. If not
properly accounted for, camera motion and scene changes result in artifacts in
the composite image. Unfortunately, existing methods to address this problem
are either accurate, but too slow for mobile devices, or fast, but prone to
failing. We propose a method that fills this void: our approach is extremely
fast---under 700ms on a commercial tablet for a pair of 5MP images---and
prevents the artifacts that arise from insufficient registration quality.
Authors: ['Orazio Gallo', 'Alejandro Troccoli', 'Jun Hu', 'Kari Pulli', 'Jan Kautz']
Affiliations: ['NVIDIA', 'NVIDIA', 'NVIDIA', 'Duke University', 'NVIDIA', 'Light', 'NVIDIA']


In [76]:
import arxiv

client = arxiv.Client()

search = arxiv.Search(
    query="",
    id_list=["2405.14900v1"],
    max_results=10,
    sort_by=arxiv.SortCriterion.SubmittedDate
)
results = client.results(search)
print([r.title for r in results])

INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=2405.14900v1&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 1 of 1 total results


['Fair Evaluation of Federated Learning Algorithms for Automated Breast Density Classification: The Results of the 2022 ACR-NCI-NVIDIA Federated Learning Challenge']


In [71]:
all_results = client.results(search)
print([r.title for r in all_results])

INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=id%3A2405.14900v1&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got empty first page; stopping generation


[]


In [77]:
for result in client.results(search):
    print(dir(result))
    print(result.Author)
    print(result.authors)
    print(result.title)
    print(result.summary)
    print(result.published)
    print(result.updated)
    print(result.entry_id)
    print(result.primary_category)
    print(result.categories)
    print(result.pdf_url)
    print(result.links)
    break

INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=2405.14900v1&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 1 of 1 total results


['Author', 'Link', 'MissingFieldError', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_from_feed_entry', '_get_default_filename', '_get_pdf_url', '_raw', '_to_datetime', 'authors', 'categories', 'comment', 'doi', 'download_pdf', 'download_source', 'entry_id', 'get_short_id', 'journal_ref', 'links', 'pdf_url', 'primary_category', 'published', 'summary', 'title', 'updated']
<class 'arxiv.Result.Author'>
[arxiv.Result.Author('Kendall Schmidt'), arxiv.Result.Author('Benjamin Bearce'), arxiv.Result.Author('Ken Chang'), arxiv.Result.Author('Laura Coombs'), arxiv.Result.Author('Keyvan Farahani'), arxiv.Result.Author('Marawan Elbatele'), arxiv.Result.Author('Kaouthe

In [64]:
for result in client.results(search):
    for author in result.authors:
        print(author.name)

INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=au%3ANVIDIA&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 32 of 32 total results


NVIDIA
:
Alisson Azzolini
Hannah Brandon
Prithvijit Chattopadhyay
Huayu Chen
Jinju Chu
Yin Cui
Jenna Diamond
Yifan Ding
Francesco Ferroni
Rama Govindaraju
Jinwei Gu
Siddharth Gururani
Imad El Hanafi
Zekun Hao
Jacob Huffman
Jingyi Jin
Brendan Johnson
Rizwan Khan
George Kurian
Elena Lantz
Nayeon Lee
Zhaoshuo Li
Xuan Li
Tsung-Yi Lin
Yen-Chen Lin
Ming-Yu Liu
Andrew Mathau
Yun Ni
Lindsey Pavao
Wei Ping
David W. Romero
Misha Smelyanskiy
Shuran Song
Lyne Tchapmi
Andrew Z. Wang
Boxin Wang
Haoxiang Wang
Fangyin Wei
Jiashu Xu
Yao Xu
Xiaodong Yang
Zhuolin Yang
Xiaohui Zeng
Zhe Zhang
NVIDIA
:
Johan Bjorck
Fernando Castañeda
Nikita Cherniadev
Xingye Da
Runyu Ding
Linxi "Jim" Fan
Yu Fang
Dieter Fox
Fengyuan Hu
Spencer Huang
Joel Jang
Zhenyu Jiang
Jan Kautz
Kaushil Kundalia
Lawrence Lao
Zhiqi Li
Zongyu Lin
Kevin Lin
Guilin Liu
Edith Llontop
Loic Magne
Ajay Mandlekar
Avnish Narayan
Soroush Nasiriany
Scott Reed
You Liang Tan
Guanzhi Wang
Zu Wang
Jing Wang
Qi Wang
Jiannan Xiang
Yuqi Xie
Yinzhen Xu
Zhenj

In [61]:
print([r.title for r in all_results])

[]


In [None]:
import feedparser

feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

feed = feedparser.parse(dummy_data)
logger.info(f'Feed title: %s' % feed.feed.title)
# Create a dictionary to store feed metadata
result = {
    'feed_metadata': {
        'title': feed.feed.title,
        'updated': feed.feed.updated,
        'total_results': feed.feed.opensearch_totalresults,
        'items_per_page': feed.feed.opensearch_itemsperpage,
        'start_index': feed.feed.opensearch_startindex
    },
    'papers': []
}

# Process each entry
for entry in feed.entries:
    paper = {
        'arxiv_id': entry.id,
        'title': entry.title,
        'authors': [author.name for author in entry.authors],
        'affiliation': [author.get('arxiv:affiliation', '') for author in entry.authors]
    }
    result['papers'].append(paper)


AttributeError: module 'feedparser' has no attribute '_FeedParserMixin'

In [83]:
import xml.etree.ElementTree as ET

xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <link href="http://arxiv.org/api/query?search_query%3Dau%3ANVIDIA%26id_list%3D%26start%3D0%26max_results%3D100" rel="self" type="application/atom+xml"/>
    <title type="html">ArXiv Query: search_query=au:NVIDIA&amp;id_list=&amp;start=0&amp;max_results=100</title>
    <id>http://arxiv.org/api/FDHFT2fjT+rKbUJuyXDF3N1b4Zk</id>
    <updated>2025-03-28T00:00:00-04:00</updated>
    <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">32</opensearch:totalResults>
    <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
    <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">100</opensearch:itemsPerPage>
    <entry>
        <id>http://arxiv.org/abs/1504.01441v3</id>
        <updated>2015-05-05T00:15:00Z</updated>
        <published>2015-04-07T00:29:54Z</published>
        <title>Locally Non-rigid Registration for Mobile HDR Photography</title>
        <summary>  Image registration for stack-based HDR photography is challenging. If not
properly accounted for, camera motion and scene changes result in artifacts in
the composite image. Unfortunately, existing methods to address this problem
are either accurate, but too slow for mobile devices, or fast, but prone to
failing. We propose a method that fills this void: our approach is extremely
fast---under 700ms on a commercial tablet for a pair of 5MP images---and
prevents the artifacts that arise from insufficient registration quality.
</summary>
        <author>
            <name>Orazio Gallo</name>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NVIDIA</arxiv:affiliation>
        </author>
        <author>
            <name>Alejandro Troccoli</name>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NVIDIA</arxiv:affiliation>
        </author>
        <author>
            <name>Jun Hu</name>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NVIDIA</arxiv:affiliation>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">Duke University</arxiv:affiliation>
        </author>
        <author>
            <name>Kari Pulli</name>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NVIDIA</arxiv:affiliation>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">Light</arxiv:affiliation>
        </author>
        <author>
            <name>Jan Kautz</name>
            <arxiv:affiliation xmlns:arxiv="http://arxiv.org/schemas/atom">NVIDIA</arxiv:affiliation>
        </author>
        <link href="http://arxiv.org/abs/1504.01441v3" rel="alternate" type="text/html"/>
        <link title="pdf" href="http://arxiv.org/pdf/1504.01441v3" rel="related" type="application/pdf"/>
        <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.CV" scheme="http://arxiv.org/schemas/atom"/>
        <category term="cs.CV" scheme="http://arxiv.org/schemas/atom"/>
    </entry>
</feed>
"""


In [84]:
def parse_arxiv_feed(xml_data: str) -> dict:
    """
    解析 arXiv API 返回的 XML 数据，提取 feed 信息和论文条目，并返回为字典格式。

    参数:
        xml_data: str - 包含 arXiv API 响应的 XML 数据字符串。

    返回:
        dict: 包含 feed 信息和论文条目的字典，结构如下：
            {
                "feed_info": { "feed_title": ..., "updated": ..., "total_results": ... },
                "papers": [
                    {
                        "paper_id": ...,
                        "title": ...,
                        "published": ...,
                        "summary": ...,
                        "authors": [
                            {"name": ..., "affiliations": [...]},
                            ...
                        ]
                    },
                    ...
                ]
            }
    """
    ns = {
        "atom": "http://www.w3.org/2005/Atom",
        "opensearch": "http://a9.com/-/spec/opensearch/1.1/",
        "arxiv": "http://arxiv.org/schemas/atom"
    }
    root = ET.fromstring(xml_data)

    feed_info = {
        "feed_title": root.find("atom:title", ns).text,
        "updated": root.find("atom:updated", ns).text,
        "total_results": root.find("opensearch:totalResults", ns).text,
        "items_per_page": root.find("opensearch:itemsPerPage", ns).text,
        "start_index": root.find("opensearch:startIndex", ns).text
    }

    papers = []
    for entry in root.findall("atom:entry", ns):
        # 提取 primary_category
        primary_category_elem = entry.find("arxiv:primary_category", ns)
        primary_category = primary_category_elem.attrib.get("term") if primary_category_elem is not None else None

        # 提取所有 category（默认命名空间下）
        categories = []
        for cat in entry.findall("atom:category", ns):
            term = cat.attrib.get("term")
            if term:
                categories.append(term)
    
        paper = {
            "arxiv_id": entry.find("atom:id", ns).text,
            "title": entry.find("atom:title", ns).text,
            "published": entry.find("atom:published", ns).text,
            "summary": entry.find("atom:summary", ns).text.strip() if entry.find("atom:summary", ns) is not None else "",
            "authors": [],
            "primary_category": primary_category,
            "categories": categories
        }
        for author in entry.findall("atom:author", ns):
            name = author.find("atom:name", ns).text
            aff_list = [aff.text for aff in author.findall("arxiv:affiliation", ns)]
            paper["authors"].append({"name": name, "affiliations": aff_list})
        papers.append(paper)

    return {
        "feed_info": feed_info,
        "papers": papers
    }

In [85]:
print(parse_arxiv_feed(xml_data))

{'feed_info': {'feed_title': 'ArXiv Query: search_query=au:NVIDIA&id_list=&start=0&max_results=100', 'updated': '2025-03-28T00:00:00-04:00', 'total_results': '32', 'items_per_page': '100', 'start_index': '0'}, 'papers': [{'arxiv_id': 'http://arxiv.org/abs/1504.01441v3', 'title': 'Locally Non-rigid Registration for Mobile HDR Photography', 'published': '2015-04-07T00:29:54Z', 'summary': 'Image registration for stack-based HDR photography is challenging. If not\nproperly accounted for, camera motion and scene changes result in artifacts in\nthe composite image. Unfortunately, existing methods to address this problem\nare either accurate, but too slow for mobile devices, or fast, but prone to\nfailing. We propose a method that fills this void: our approach is extremely\nfast---under 700ms on a commercial tablet for a pair of 5MP images---and\nprevents the artifacts that arise from insufficient registration quality.', 'authors': [{'name': 'Orazio Gallo', 'affiliations': ['NVIDIA']}, {'name