In [1]:
import getpass
import json
import os
import re
import time
from datetime import datetime
from typing import Union, List

import openreview
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


# Client instance for the OpenReview API V2.
# Credentials are read from the OPENREVIEW_USERNAME / OPENREVIEW_PASSWORD
# environment variables, falling back to an interactive prompt, so that no
# secret is ever stored in the notebook.
# NOTE(review): the password that used to be hardcoded here must be treated as
# leaked and rotated.
client = openreview.api.OpenReviewClient(
    baseurl='https://api2.openreview.net',
    username=os.environ.get('OPENREVIEW_USERNAME') or input('OpenReview username: '),
    password=os.environ.get('OPENREVIEW_PASSWORD') or getpass.getpass('OpenReview password: '),
)

In [2]:
def get_venues(client):
    """Return the members of the top-level ``venues`` group.

    Args:
        client: Object exposing ``get_group(id=...)`` whose result carries a
            ``members`` attribute (here: the OpenReview client).

    Returns:
        The ``members`` attribute of the ``venues`` group, unchanged.
    """
    return client.get_group(id='venues').members
venues = get_venues(client)
# print(venues)  # uncomment to inspect the full list of venue ids


# Venue ids of interest for ICLR 2025. The list is not bound to a name: as the
# last expression of the cell it is only echoed back as the cell output.
['ICLR.cc/2025/Conference',
 'ICLR.cc/2025/Workshop_Proposals',
 'ICLR.cc/2025/BlogPosts'
]

['ICLR.cc/2025/Conference',
 'ICLR.cc/2025/Workshop_Proposals',
 'ICLR.cc/2025/BlogPosts']

In [3]:
def get_submissions(client, venue_id, status='all'):
    """Fetch the notes of a venue, optionally restricted to one status.

    Args:
        client: Object exposing ``get_group(venue_id)`` and
            ``get_all_notes(...)`` (here: the OpenReview client).
        venue_id: Id of the venue group, e.g. ``'ICLR.cc/2025/Conference'``.
        status: One of ``'all'``, ``'accepted'``, ``'under_review'``,
            ``'withdrawn'`` or ``'desk_rejected'``.

    Returns:
        Whatever ``client.get_all_notes`` returns for the selected query.

    Raises:
        ValueError: If ``status`` is not one of the supported values. This is
            checked before any request is sent.
        KeyError: If the venue group content lacks the field required for the
            requested ``status``.
    """
    # Venue-content field that holds the identifier for each status.
    # ``None`` means the venue group's own id is used (accepted papers are
    # matched by a ``venueid`` equal to the venue id).
    content_field_by_status = {
        "all": 'submission_name',
        "accepted": None,
        "under_review": 'submission_venue_id',
        "withdrawn": 'withdrawn_venue_id',
        "desk_rejected": 'desk_rejected_venue_id',
    }

    # Validate first so that a typo does not cost a network round trip.
    if status not in content_field_by_status:
        raise ValueError(f"Invalid status: {status}. Valid options are: {list(content_field_by_status.keys())}")

    # Retrieve the venue group information
    venue_group = client.get_group(venue_id)

    # Only read the field this status needs; a venue that lacks an unrelated
    # field (e.g. no withdrawn venue id) must not break the other statuses.
    field_name = content_field_by_status[status]
    if field_name is None:
        identifier = venue_group.id
    else:
        identifier = venue_group.content[field_name]['value']

    if status == "all":
        # All submissions regardless of status are selected by invitation.
        return client.get_all_notes(invitation=f'{venue_id}/-/{identifier}')

    # Every other status is selected through the 'venueid' content field.
    return client.get_all_notes(content={'venueid': identifier})

In [4]:
# Fetch the notes of the ICLR 2025 conference that get_submissions selects for
# the 'under_review' status.
venue_id = 'ICLR.cc/2025/Conference'
submissions = get_submissions(client, venue_id, 'under_review')
# print(submissions[0])  # uncomment to inspect a single note

Getting V2 Notes: 100%|█████████▉| 10788/10799 [00:15<00:00, 700.27it/s]


In [None]:
def extract_comments(id, domain='ICLR.cc/2025/Conference'):
    """Fetch the ``content`` of the notes attached to a forum.

    Args:
        id: Forum id sent as the ``forum`` query parameter. The name shadows
            the builtin but is kept so existing keyword callers still work.
        domain: Value sent as the ``domain`` query parameter. Defaults to the
            ICLR 2025 conference, which was previously hard-coded in the URL.

    Returns:
        list: The ``content`` of every returned note except the last one.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.RequestException: On connection errors or when the 6 second
            timeout expires.
        KeyError: If the response has no ``notes`` key or a note has no
            ``content`` key.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    }
    # Let requests do the percent-encoding instead of hand-escaping the query
    # string; this also escapes the forum id correctly.
    params = {
        'details': 'writable,signatures,invitation,presentation,tags',
        'domain': domain,
        'forum': id,
        'limit': 1000,
        'trash': 'true',
    }
    response = requests.get(
        'https://api2.openreview.net/notes',
        params=params,
        headers=headers,
        timeout=6,
    )
    response.raise_for_status()
    notes = response.json()['notes']
    comments = [note['content'] for note in notes]
    # NOTE(review): the last note is dropped, presumably because it is the
    # submission itself. That relies on the API's ordering - TODO confirm.
    return comments[:-1]

In [6]:
def extract_submission_info(submission):
    """Build a JSON-serialisable summary of one submission note.

    Args:
        submission: Note object exposing ``id``, ``content``, ``cdate``,
            ``odate`` and ``mdate``. Each content entry read here is indexed
            with ``['value']``.

    Returns:
        dict: Keys ``id``, ``title``, ``abstract``, ``keywords``,
        ``primary_area``, ``TLDR``, the three ``*_date`` strings, the forum
        and PDF links, and ``comments`` (the result of ``extract_comments``).

    Raises:
        KeyError: If ``title``, ``abstract``, ``keywords`` or ``primary_area``
            is missing from the content. Only ``TLDR`` is optional.
    """
    def as_day(millis):
        # Timestamps are divided by 1000 before conversion; falsy values
        # (None, 0) yield None.
        if not millis:
            return None
        return datetime.fromtimestamp(millis / 1000).strftime('%Y-%m-%d')

    note_id = submission.id
    content = submission.content

    # Fill the record in a fixed order: key order is what ends up in the JSON
    # file, and the network call for comments is deliberately last.
    record = {'id': note_id}
    for required in ('title', 'abstract', 'keywords', 'primary_area'):
        record[required] = content[required]['value']

    if 'TLDR' in content:
        record['TLDR'] = content['TLDR']['value']
    else:
        record['TLDR'] = ""

    record['creation_date'] = as_day(submission.cdate)
    record['original_date'] = as_day(submission.odate)
    record['modification_date'] = as_day(submission.mdate)
    record['forum_link'] = f"https://openreview.net/forum?id={note_id}"
    record['pdf_link'] = f"https://openreview.net/pdf?id={note_id}"
    record['comments'] = extract_comments(note_id)
    return record

In [7]:
venue_id = 'ICLR.cc/2025/Conference'
# NOTE(review): "ICRL_2025" looks like a typo for "ICLR_2025"; the path is left
# unchanged because previously written files may already live there.
output_dir = '/home/xyzhi/Desktop/conference_summary/papers/ICRL_2025'
MAX_RETRIES = 3  # maximum number of attempts per submission
RETRY_DELAY = 3  # seconds to wait after every attempt (also throttles requests)

# Fail fast if the directory cannot be created, instead of failing (and
# sleeping) once per attempt for every submission.
os.makedirs(output_dir, exist_ok=True)

submissions = get_submissions(client, venue_id, 'under_review')
failed_ids = []  # submissions that still failed after MAX_RETRIES attempts
for sub in tqdm(submissions):
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            info = extract_submission_info(sub)
            final_path = f"{output_dir}/{info['id']}.json"
            partial_path = f"{final_path}.tmp"
            # Write to a temporary name and rename afterwards, so that an
            # interrupted run never leaves a truncated ``<id>.json`` that a
            # later resume step would mistake for a finished download.
            with open(partial_path, 'w') as f:
                json.dump(info, f, indent=4)
            os.replace(partial_path, final_path)
        except Exception as e:
            print(f"处理提交 {sub.id} 时发生错误: {e} (重试 {attempt}/{MAX_RETRIES})")
            time.sleep(RETRY_DELAY)  # wait before the next attempt
        else:
            time.sleep(RETRY_DELAY)  # wait before the next submission
            break
    else:
        # Every attempt raised: remember the id instead of dropping it silently.
        failed_ids.append(sub.id)

if failed_ids:
    print(f"{len(failed_ids)} submissions failed after {MAX_RETRIES} attempts: {failed_ids}")

Getting V2 Notes: 100%|█████████▉| 10788/10799 [00:11<00:00, 915.73it/s]
 12%|█▏        | 1246/10799 [1:28:25<11:16:45,  4.25s/it]

处理提交 suz4utPr9Y 时发生错误: HTTPSConnectionPool(host='api2.openreview.net', port=443): Max retries exceeded with url: /notes?details=writable%2Csignatures%2Cinvitation%2Cpresentation%2Ctags&domain=ICLR.cc%2F2025%2FConference&forum=suz4utPr9Y&limit=1000&trash=true (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fea4404fa10>, 'Connection to api2.openreview.net timed out. (connect timeout=None)')) (重试 1/3)


 27%|██▋       | 2961/10799 [5:54:36<15:38:40,  7.19s/it] 


KeyboardInterrupt: 

In [14]:
import os

MAX_RETRIES = 3  # maximum number of attempts per submission
RETRY_DELAY = 3  # seconds to wait after every attempt (also throttles requests)

# Ids that already have a ``<id>.json`` file in output_dir. A set keeps the
# per-submission membership test O(1), and splitext strips only the final
# extension, so an id containing a dot is not truncated.
json_files = {os.path.splitext(name)[0] for name in os.listdir(output_dir) if name.endswith('.json')}

failed_ids = []  # submissions that still failed after MAX_RETRIES attempts
for sub in tqdm(submissions):
    if sub.id in json_files:
        continue  # already downloaded
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            info = extract_submission_info(sub)
            final_path = f"{output_dir}/{info['id']}.json"
            partial_path = f"{final_path}.tmp"
            # Write to a temporary name and rename afterwards, so that an
            # interrupted run never leaves a truncated ``<id>.json`` behind.
            with open(partial_path, 'w') as f:
                json.dump(info, f, indent=4)
            os.replace(partial_path, final_path)
        except Exception as e:
            print(f"处理提交 {sub.id} 时发生错误: {e} (重试 {attempt}/{MAX_RETRIES})")
            time.sleep(RETRY_DELAY)  # wait before the next attempt
        else:
            time.sleep(RETRY_DELAY)  # wait before the next submission
            break
    else:
        # Every attempt raised: remember the id instead of dropping it silently.
        failed_ids.append(sub.id)

if failed_ids:
    print(f"{len(failed_ids)} submissions failed after {MAX_RETRIES} attempts: {failed_ids}")

  0%|          | 0/10799 [00:00<?, ?it/s]

100%|██████████| 10799/10799 [00:14<00:00, 733.55it/s] 
