In [46]:
### request和beautiful soup是静态的 ###

import requests
from bs4 import BeautifulSoup

# OpenReview 的 URL
url = "https://openreview.net/group?id=ACM.org/TheWebConf/2025/Conference#tab-accept-oral"

# 设置请求头，模拟浏览器请求
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# 发送 HTTP 请求获取页面内容
response = requests.get(url, headers=headers)

# 检查请求是否成功
if response.status_code != 200:
    print(f"请求失败，状态码: {response.status_code}")
    exit()

# 解析 HTML 内容
soup = BeautifulSoup(response.text, "html.parser")

# 打开一个 .txt 文件用于保存结果
with open("openreview_titles_abstracts.txt", "w", encoding="utf-8") as file:
    # 查找所有文章的链接
    paper_links = []

    papers = soup.find_all("a", href=True)  # 根据实际页面结构调整选择器

    for paper in papers:
        link = paper["href"]
        if "/forum?id=" in link:  # 确保链接是文章详情页
            paper_links.append("https://openreview.net" + link)
    print(paper_links)

    # 遍历每个文章的链接
    for link in paper_links:
        try:
            # 发送 HTTP 请求获取文章详情页内容
            paper_response = requests.get(link, headers=headers)
            if paper_response.status_code != 200:
                print(f"无法获取文章页面: {link}")
                continue

            # 解析文章详情页内容
            paper_soup = BeautifulSoup(paper_response.text, "html.parser")

            # 提取标题
            title = paper_soup.find("h1", class_="note-content-title")
            if title:
                title = title.text.strip()
            else:
                title = "未找到标题"

            # 提取摘要
            abstract = paper_soup.find("div", class_="note-content-abstract")
            if abstract:
                abstract = abstract.text.strip()
            else:
                abstract = "未找到摘要"

            # 将标题和摘要写入文件
            file.write(f"Title: {title}\n")
            file.write(f"Abstract: {abstract}\n")
            file.write("-" * 80 + "\n")
        except Exception as e:
            print(f"爬取文章 {link} 时出错: {e}")

print("数据已成功保存到 openreview_titles_abstracts.txt 文件中。")

[]
数据已成功保存到 openreview_titles_abstracts.txt 文件中。


In [53]:
### 动态 ###
### 缺点：不支持所有页面内容提取，在单页中 没有筛选的全部文章都提取了
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import string

# 配置 Selenium
options = Options()
options.add_argument("--headless")  # 无头模式
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

# 创建selenium webdriver实例
driver= webdriver.Chrome()

### 只需更改此处 ###
# 目标网址  
url = "https://openreview.net/group?id=ACM.org/TheWebConf/2025/Conference"
driver.get(url)

# 等待页面加载
time.sleep(5)

# 获取所有论文的链接
papers = driver.find_elements(By.CLASS_NAME, "note")  # 根据实际页面结构调整
results = []
links = []

for paper in papers:
    # 查找包含论坛链接的 a 标签
    link_elem = paper.find_element(By.XPATH, ".//a[contains(@href, '/forum?')]")  # 选择 href 包含 /forum? 的 a 标签
    links.append(link_elem.get_attribute("href"))  # 获取链接

for i,link in zip(range(len(links)),links):     
    # 访问该文章链接
    driver.get(link)
    # print(link)
    if i % 60 != 0:
        time.sleep(6)  # 等待页面加载
    else:
        time.sleep(20)
    
        # 提取标题和摘要
    try:
        meta_tag = driver.find_element(By.CSS_SELECTOR, 'meta[name="citation_title"]')
        title = meta_tag.get_attribute("content")
        meta_tag = driver.find_element(By.CSS_SELECTOR, 'meta[name="citation_abstract"]')
        abstract = meta_tag.get_attribute("content")
    except Exception as e:
        title = "未找到标题"
        abstract = "未找到摘要"
        print(f"无法获取文章信息: {e}")
        
    # 将结果保存到列表
    results.append(f"## Title: {title}\nAbstract: {abstract}\n")
    
    # 定义非法字符映射表，将其替换为 "_"
    invalid_chars = '\\/*?:"<>|'
    translation_table = str.maketrans(invalid_chars, "_" * len(invalid_chars))
    safe_title = title.translate(translation_table)

    with open(f"./tit_ab_files/{safe_title}.md", "w", encoding="utf-8") as file:
        file.write(results[i])
    
 # 关闭浏览器
driver.quit()

# 保存到文件
with open("openreview_titles_abstracts.md", "w", encoding="utf-8") as file:
    file.write("\n".join(results))

print("数据已保存到 openreview_titles_abstracts.md")



数据已保存到 openreview_titles_abstracts.md


In [10]:
### 动态 ###
### 修改
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import string

# 配置 Selenium
options = Options()
options.add_argument("--headless")  # 无头模式
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

# 创建selenium webdriver实例
driver= webdriver.Chrome()

### 只需更改此处 ###
# 目标网址  
url = "https://openreview.net/group?id=ACM.org/TheWebConf/2025/Conference"
driver.get(url)
page = 1
# 等待页面加载
time.sleep(5)

while True:
    # 获取所有论文的链接
    accept_oral_div = driver.find_element(By.CSS_SELECTOR, "div#accept-oral")
    papers = accept_oral_div.find_elements(By.CLASS_NAME, "note")  # 根据实际页面结构调整
    results = []
    links = []

    for paper in papers:
        # 查找包含论坛链接的 a 标签
        link_elem = paper.find_element(By.XPATH, ".//a[contains(@href, '/forum?')]")  # 选择 href 包含 /forum? 的 a 标签
        links.append(link_elem.get_attribute("href"))  # 获取链接

    for i,link in zip(range(len(links)),links):     
        # 访问该文章链接
        driver.get(link)
        time.sleep(5)

            # 提取标题和摘要
        try:
            meta_tag = driver.find_element(By.CSS_SELECTOR, 'meta[name="citation_title"]')
            title = meta_tag.get_attribute("content")
            meta_tag = driver.find_element(By.CSS_SELECTOR, 'meta[name="citation_abstract"]')
            abstract = meta_tag.get_attribute("content")
        except Exception as e:
            title = "未找到标题"
            abstract = "未找到摘要"
            print(f"无法获取文章信息: {e}")

        # 将结果保存到列表
        results = (f"## Title: {title}\nAbstract: {abstract}\n")

        # 定义非法字符映射表，将其替换为 "_"
        invalid_chars = '\\/*?:"<>|'
        translation_table = str.maketrans(invalid_chars, "_" * len(invalid_chars))
        safe_title = title.translate(translation_table)

        with open(f"./tit_ab_files/{safe_title}.md", "w", encoding="utf-8") as file:
            file.write(results)
        
    # 回到首页
    driver.get(url)
    time.sleep(5)
    
    # 查找“下一页”按钮
    try:
        page = page + 1
        next_button = driver.find_element(By.XPATH, f"//a[text()= '{page}']")
        right_arrow = driver.find_element(By.XPATH, "//li[@class='right-arrow']")
        if "disabled" in right_arrow.get_attribute("class"):
            print("没有更多页面，结束爬取")
            break
        else:
            next_button.click()
            time.sleep(5)
    except Exception as e:
        print(f"无法找到下一页，爬取结束，{e}")
        break
                
# 关闭浏览器
driver.quit()


无法找到下一页，爬取结束，Message: element not interactable
  (Session info: chrome=120.0.6099.225)
Stacktrace:
	GetHandleVerifier [0x00007FF758072142+3514994]
	(No symbol) [0x00007FF757C90CE2]
	(No symbol) [0x00007FF757B374C3]
	(No symbol) [0x00007FF757B82D29]
	(No symbol) [0x00007FF757B76A0F]
	(No symbol) [0x00007FF757BA5FEA]
	(No symbol) [0x00007FF757B763B6]
	(No symbol) [0x00007FF757BA6490]
	(No symbol) [0x00007FF757BC28F6]
	(No symbol) [0x00007FF757BA5D93]
	(No symbol) [0x00007FF757B74BDC]
	(No symbol) [0x00007FF757B75C64]
	GetHandleVerifier [0x00007FF75809E16B+3695259]
	GetHandleVerifier [0x00007FF7580F6737+4057191]
	GetHandleVerifier [0x00007FF7580EE4E3+4023827]
	GetHandleVerifier [0x00007FF757DC04F9+689705]
	(No symbol) [0x00007FF757C9C048]
	(No symbol) [0x00007FF757C98044]
	(No symbol) [0x00007FF757C981C9]
	(No symbol) [0x00007FF757C888C4]
	BaseThreadInitThunk [0x00007FFC7818259D+29]
	RtlUserThreadStart [0x00007FFC79E0AF38+40]



In [11]:
import os

# 定义文件夹路径和输出文件路径
input_folder = './tit_ab_files'
output_file = 'openreview_titles_abstracts.md'

# 打开输出文件，准备写入
with open(output_file, 'w', encoding='utf-8') as outfile:
    # 遍历文件夹中的所有文件
    for filename in os.listdir(input_folder):
        file_path = os.path.join(input_folder, filename)

        # 确保文件是一个普通文件而不是目录
        if os.path.isfile(file_path):
            try:
                # 打开并读取文件内容
                with open(file_path, 'r', encoding='utf-8') as infile:
                    content = infile.read()

                    # 将文件内容写入输出文件
                    outfile.write(f"### {filename}\n\n")
                    outfile.write(content + "\n\n")
            except Exception as e:
                print(f"无法读取文件 {filename}: {e}")

print("文件内容已成功保存到 openreview_titles_abstracts.md")


文件内容已成功保存到 openreview_titles_abstracts.md
