In [70]:
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List
import json
import pandas as pd

from bs4 import BeautifulSoup
import feedparser

In [71]:
class ArxivRSS:
    """Fetch an RSS feed and expose its entries as a pandas DataFrame.

    Attributes:
        url: Address of the RSS feed handed to ``feedparser.parse``.
        paper_df: Result of the most recent ``fetch_paper_list`` call, or
            ``None`` if it has not been called yet.
    """

    # Column schema of the returned DataFrame. Declared once so that an empty
    # feed still produces a frame with these columns (a column-less frame
    # would make a later ``merge(on="id")`` fail with KeyError).
    PAPER_COLUMNS = ["id", "title", "abstract", "url", "authors"]

    def __init__(self, url):
        """Remember the feed URL; nothing is fetched until requested."""
        self.url = url
        self.paper_df = None

    def fetch_paper_list(self):
        """Fetch the feed and return one row per entry.

        Returns:
            A DataFrame with the columns listed in ``PAPER_COLUMNS``. The same
            object is also stored on ``self.paper_df``. When the feed has no
            entries the frame is empty but still carries those columns.
        """
        feed = self._fetch_n_parse_rss()
        paper_list = [
            self._extract_paper_information(rss_entry)
            for rss_entry in feed["entries"]
        ]
        self.paper_df = pd.DataFrame(paper_list, columns=self.PAPER_COLUMNS)
        return self.paper_df

    def _fetch_n_parse_rss(self):
        """Parse ``self.url`` with feedparser and return the parsed feed."""
        return feedparser.parse(self.url)

    def _parse_html_element(self, raw_string):
        """Return the text content of ``raw_string`` with HTML markup removed."""
        soup = BeautifulSoup(raw_string, "html.parser")
        return soup.text

    def _extract_paper_information(self, rss_entry):
        """Map one feed entry onto the flat record used for a DataFrame row.

        Args:
            rss_entry: Mapping with ``id``, ``title``, ``summary`` and ``link``
                keys, and optionally ``authors`` (a list of mappings that each
                have a ``name`` key).

        Returns:
            A dict with keys ``id``, ``title``, ``abstract``, ``url`` and
            ``authors`` (a list of author-name strings; empty when the entry
            carries no ``authors`` key).
        """
        # Entries without author metadata should not abort the whole fetch.
        paper_authors = [
            self._parse_html_element(author_info["name"])
            for author_info in rss_entry.get("authors", [])
        ]
        return {
            "id": rss_entry["id"],
            "title": rss_entry["title"],
            "abstract": self._parse_html_element(rss_entry["summary"]),
            "url": rss_entry["link"],
            "authors": paper_authors,
        }

In [67]:
# One relevance verdict for a single topic: the topic, a relevance score and
# the reasoning behind it. Documented with `#` comments rather than a
# docstring so the model's generated JSON schema is left exactly as before.
class Judgement(BaseModel):
    # Topic the verdict refers to.
    topic: str = Field(
        description="The topic of the paper",
    )
    # Relevance score; the description asks for a value between 0 and 1, but
    # no range constraint is enforced on the field itself.
    relevance: float = Field(
        description="The relevance of the paper to the topic, a number between 0 and 1",
    )
    # Free-text justification for the score.
    reason: str = Field(
        description="The reason for the relevance",
    )

# Container holding one Judgement per topic.
# NOTE(review): later cells reference `RatingList` and a 'ratings' key, neither
# of which is defined in the visible notebook — confirm which schema is the
# intended one before relying on this model downstream.
class Judgements(BaseModel):
    # The element type was `Rating`, a name no visible cell defines, so the
    # class raised NameError on a fresh kernel; `Judgement` is the per-topic
    # model declared in this notebook.
    judgement: List[Judgement] = Field(description="A list of topics with relevance and reasoning")

In [58]:
# Hand-written sample payload keyed by topic name; each value carries a
# `relevance` score and a `reason` string. Used to try out model validation.
data = dict(
    topic1=dict(relevance=0, reason=""),
    topic2=dict(relevance=0.9, reason="The paper ...."),
)

In [60]:
# Validate the sample payload against the topic -> rating mapping model.
# NOTE(review): `RatingDict` (and the `Rating` type shown in the saved output)
# is not defined in any visible cell, so this only runs with leftover kernel
# state — restore the definition or drop this cell.
RatingDict.model_validate(data)

RatingDict(root={'topic1': Rating(relevance=0.0, reason=''), 'topic2': Rating(relevance=0.9, reason='The paper ....')})

In [31]:
# OpenAI client reused by the request cells below, created with timeout=60.
# NOTE(review): the config file also carries a 'timeout_seconds' entry (30 in
# the saved output) that is not used here — confirm which value is intended.
client = OpenAI(timeout=60)

In [32]:
# System prompt, passed as `instructions` to the model call.
system_message = """
    You are an assistant to help the user decide if a paper is very relevant to the topics of interests.
    """

# User prompt template. Filled with `str.format`, so it expects exactly three
# keyword fields: {title}, {abstract} and {topics}.
user_message = """
    Please read the following paper title and abstract:
    --------------
    Title: {title}
    Abstract: {abstract}
    --------------
    Based on the title and abstract, please rate the direct relevance of the paper with the following topics:
    --------------
    {topics}
    --------------
    For each topic, rate the relevance as a number between 0 and 1, where 0 means not relevant and 1 means very relevant.
    The paper MUST directly mention the topics to be relevant; papers with indirect relations and potential implications should have scores close to 0.
    If the paper is relevant to the topic, provide a short explanation; otherwise, leave the explanation empty.
    Use your best guess when you are not sure.
"""

In [33]:
# Hard-coded sample paper (title + abstract) used to try the prompt without
# fetching the feed.
# NOTE(review): the title here is shorter than the one shown for the same
# paper in the fetched feed output further down — confirm whether the
# truncation is intentional.
paper = {
    "title": "Position: Restructuring of Categories and Implementation of Guidelines",
    "abstract": """The intricate and multifaceted nature of vision language model (VLM)
development, adaptation, and application necessitates the establishment of
clear and standardized reporting protocols, particularly within the high-stakes
context of healthcare. Defining these reporting standards is inherently
challenging due to the diverse nature of studies involving VLMs, which vary
significantly from the development of all new VLMs or finetuning for domain
alignment to off-the-shelf use of VLM for targeted diagnosis and prediction
tasks. In this position paper, we argue that traditional machine learning
reporting standards and evaluation guidelines must be restructured to
accommodate multiphase VLM studies; it also has to be organized for intuitive
understanding of developers while maintaining rigorous standards for
reproducibility. To facilitate community adoption, we propose a categorization
framework for VLM studies and outline corresponding reporting standards that
comprehensively address performance evaluation, data reporting protocols, and
recommendations for manuscript composition. These guidelines are organized
according to the proposed categorization scheme. Lastly, we present a checklist
that consolidates reporting standards, offering a standardized tool to ensure
consistency and quality in the publication of VLM-related research."""
}

In [34]:
# Load the project settings from ../config.json (resolved against the current
# working directory). The encoding is pinned to UTF-8 so that non-ASCII text
# in the file is read the same way on every platform instead of depending on
# the locale's default encoding.
with open("../config.json", encoding="utf-8") as f:
    config = json.load(f)

In [65]:
# Ask the model to rate the sample paper against every configured topic and
# parse the reply into the schema passed as `text_format`.
# NOTE(review): `RatingList` is not defined in any visible cell; this call
# relies on leftover kernel state — restore its definition before re-running.
response = client.responses.parse(
    # Read from the loaded config instead of repeating the model name here, so
    # the notebook and the config file cannot drift apart.
    model=config["openai_model"],
    temperature=0.0,
    instructions=system_message,
    input=user_message.format(
        title=paper["title"],
        abstract=paper["abstract"],
        topics=config["topics"],
    ),
    text_format=RatingList,
)

In [77]:
# Turn the parsed structured output into a table: dump the model to plain
# Python data, then build a frame from the list stored under 'ratings'.
parsed_payload = response.output_parsed.model_dump()
paper_judgement_df = pd.DataFrame(parsed_payload['ratings'])

In [79]:
# Tag every judgement row with the sample paper's feed id so the judgements
# can later be joined to the paper list on 'id'.
# NOTE(review): the id is hard-coded and the frame is modified in place.
paper_judgement_df['id'] = 'oai:arXiv.org:2505.08818v1'

In [25]:
# Preview the fully rendered user prompt (bare last expression, so the
# notebook displays the resulting string).
prompt_fields = {
    "title": paper['title'],
    "abstract": paper['abstract'],
    "topics": config['topics'],
}
user_message.format(**prompt_fields)

"\n    Please read the following paper title and abstract:\n    --------------\n    Title: Position: Restructuring of Categories and Implementation of Guidelines\n    Abstract: The intricate and multifaceted nature of vision language model (VLM)\ndevelopment, adaptation, and application necessitates the establishment of\nclear and standardized reporting protocols, particularly within the high-stakes\ncontext of healthcare. Defining these reporting standards is inherently\nchallenging due to the diverse nature of studies involving VLMs, which vary\nsignificantly from the development of all new VLMs or finetuning for domain\nalignment to off-the-shelf use of VLM for targeted diagnosis and prediction\ntasks. In this position paper, we argue that traditional machine learning\nreporting standards and evaluation guidelines must be restructured to\naccommodate multiphase VLM studies; it also has to be organized for intuitive\nunderstanding of developers while maintaining rigorous standards fo

In [27]:
# Inspect the parsed (structured) form of the model reply.
response.output_parsed

In [28]:
# Inspect the raw output items of the model reply.
# NOTE(review): the saved output below shows `parsed=None` and a NoneType
# parameter, so it appears to come from a run without a structured
# `text_format` — confirm and refresh the output.
response.output

[ParsedResponseOutputMessage[NoneType](id='msg_68264dae0e0c8191834afa3e099a0db1013b1895f6eb3685', content=[ParsedResponseOutputText[NoneType](annotations=[], text='{\n  "Security of AI and language models": 0,\n  "Applications of AI and language models in social science research": 0,\n  "Using AI to simulate humans in various contexts": 0,\n  "Methods to increase the factuality of language model response": 0,\n  "AI and language models for generating misinformation or fact-checking": 0,\n  "Methods and applications of using AI for image and video analysis": 0.7,\n  "explanations": {\n    "Methods and applications of using AI for image and video analysis": "The paper discusses vision language models (VLMs), which inherently involve image and language data, and addresses reporting standards for their development and application, including targeted diagnosis and prediction tasks that likely involve image analysis."\n  }\n}', type='output_text', parsed=None)], role='assistant', status='com

In [72]:
# Display the loaded configuration dictionary.
config

{'arxiv_rss_base_url': 'http://arxiv.org/rss/',
 'arxiv_subjects': ['cs.CY', 'cs.CL'],
 'topics': ['Security of AI and language models',
  'Applications of AI and language models in social science research',
  'Using AI to simulate humans in various contexts',
  'Methods to increase the factuality of language model response',
  'AI and language models for generating misinformation or fact-checking',
  'Methods and applications of using AI for image and video analysis'],
 'openai_model': 'gpt-4.1-mini',
 'number_of_concurrent_tasks': 50,
 'timeout_seconds': 30}

In [73]:
# NOTE(review): `arxiv_subject` is not read by any visible cell.
arxiv_subject = [""]

# Feed URL = configured base URL followed by the first configured subject.
rss_base = config["arxiv_rss_base_url"]
first_subject = config["arxiv_subjects"][0]
rss_url = "".join([rss_base, first_subject])

arss = ArxivRSS(rss_url)

In [75]:
# Fetch and parse the feed: one row per entry, with columns
# id, title, abstract, url and authors.
paper_list_df = arss.fetch_paper_list()

In [80]:
# Join the fetched papers with their judgements on the shared 'id' column;
# only ids present in both frames are kept (inner join).
pd.merge(paper_list_df, paper_judgement_df, how='inner', on='id')

Unnamed: 0,id,title,abstract,url,authors,topic,relevance,reason
0,oai:arXiv.org:2505.08818v1,Position: Restructuring of Categories and Impl...,arXiv:2505.08818v1 Announce Type: new \nAbstra...,https://arxiv.org/abs/2505.08818,"[Amara Tariq, Rimita Lahiri, Charles Kahn, Imo...",Security of AI and language models,0.0,
1,oai:arXiv.org:2505.08818v1,Position: Restructuring of Categories and Impl...,arXiv:2505.08818v1 Announce Type: new \nAbstra...,https://arxiv.org/abs/2505.08818,"[Amara Tariq, Rimita Lahiri, Charles Kahn, Imo...",Applications of AI and language models in soci...,0.0,
2,oai:arXiv.org:2505.08818v1,Position: Restructuring of Categories and Impl...,arXiv:2505.08818v1 Announce Type: new \nAbstra...,https://arxiv.org/abs/2505.08818,"[Amara Tariq, Rimita Lahiri, Charles Kahn, Imo...",Using AI to simulate humans in various contexts,0.0,
3,oai:arXiv.org:2505.08818v1,Position: Restructuring of Categories and Impl...,arXiv:2505.08818v1 Announce Type: new \nAbstra...,https://arxiv.org/abs/2505.08818,"[Amara Tariq, Rimita Lahiri, Charles Kahn, Imo...",Methods to increase the factuality of language...,0.0,
4,oai:arXiv.org:2505.08818v1,Position: Restructuring of Categories and Impl...,arXiv:2505.08818v1 Announce Type: new \nAbstra...,https://arxiv.org/abs/2505.08818,"[Amara Tariq, Rimita Lahiri, Charles Kahn, Imo...",AI and language models for generating misinfor...,0.0,
5,oai:arXiv.org:2505.08818v1,Position: Restructuring of Categories and Impl...,arXiv:2505.08818v1 Announce Type: new \nAbstra...,https://arxiv.org/abs/2505.08818,"[Amara Tariq, Rimita Lahiri, Charles Kahn, Imo...",Methods and applications of using AI for image...,0.7,The paper discusses vision language models (VL...


In [82]:
# Scratch cell: the loop exits on its first iteration, leaving `index` and
# `row` bound to the first row of the frame for interactive inspection.
# If the frame is empty, neither name is assigned.
for index, row in paper_list_df.iterrows():
    break

In [85]:
# Show the fetched papers as a list of per-row dictionaries.
# NOTE(review): this dumps every row; consider limiting it before sharing.
paper_list_df.to_dict(orient='records')

[{'id': 'oai:arXiv.org:2505.08818v1',
  'title': 'Position: Restructuring of Categories and Implementation of Guidelines Essential for VLM Adoption in Healthcare',
  'abstract': 'arXiv:2505.08818v1 Announce Type: new \nAbstract: The intricate and multifaceted nature of vision language model (VLM) development, adaptation, and application necessitates the establishment of clear and standardized reporting protocols, particularly within the high-stakes context of healthcare. Defining these reporting standards is inherently challenging due to the diverse nature of studies involving VLMs, which vary significantly from the development of all new VLMs or finetuning for domain alignment to off-the-shelf use of VLM for targeted diagnosis and prediction tasks. In this position paper, we argue that traditional machine learning reporting standards and evaluation guidelines must be restructured to accommodate multiphase VLM studies; it also has to be organized for intuitive understanding of develope