# 美联储议息决议内容爬取

prompt:
```text
下面这个网站是美联储FOMC的会议网址：https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm
该网址的正文部分按照自然年用Panel控件发布了每年的美联储议息会议的公告，包括会议月份和日期、议息决议Statement和会议备忘录Minutes。其中Statement给出了PDF、HTML以及Implementation Note三个链接，Minutes则只给出了PDF和HTML链接以及发布时间。注意，有的决议可能不披露上述信息，则仅保存月份、日期和决议名称等信息。
我想要按照年份，爬取每一条议息决议数据，包括月份、日期、statement中HTML的网址和内容，以及会议备忘录minutes中的内容，每一条决议数据都存储为一个字典，最后按年份来保存所有数据为json文件。
记载每一条议息决议的字典应当包含如下键，含义如下：
year: 决议年份，为str类型，如2024
month: 决议月份，为str类型，如Feb
date: 决议日期，为str类型，如17-19
statement: 议息决议，为字典类型。包括html和implementation note两个键，每个键对应一个字典，分别包含href, title, date, content四个键。
minutes: 会议备忘录，同样为字典类型。包括html一个键，该键包含一个字典，包含href, title, date, content四个键。

请帮我开发python代码，爬虫工具可以使用selenium和beautifulsoup4来实现。
```

In [3]:
from datetime import datetime

def parse_date(date_string):
    """Convert text such as ``"Sep 23, 2024"`` into a ``datetime``.

    Leading and trailing whitespace is ignored. When the text does not
    follow the ``"%b %d, %Y"`` layout, a diagnostic containing the original
    input is printed and ``None`` is returned instead of raising.
    """
    cleaned = date_string.strip()
    try:
        parsed = datetime.strptime(cleaned, "%b %d, %Y")
    except ValueError:
        print(f"Error parsing date: {date_string}")
        return None
    return parsed
    
# Parse a couple of sample strings and report the most recent one.
dates = []
for date_str in ["Sep 23, 2024", "Oct 23, 1995"]:
    date = parse_date(date_str)
    print(date)
    # parse_date returns None for malformed input; keep only real datetimes
    # so that max() below never has to compare None with a datetime.
    if date is not None:
        dates.append(date)
# default=None avoids a ValueError when nothing could be parsed.
print(max(dates, default=None))

2024-09-23 00:00:00
1995-10-23 00:00:00
2024-09-23 00:00:00


In [1]:
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Set up the WebDriver (make sure the ChromeDriver path is configured).
driver = webdriver.Chrome()
url = 'https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm'
# Shared explicit wait (10 s timeout) reused for every page load below.
wait = WebDriverWait(driver, 10)

try:
    driver.get(url)
    # Wait until at least one element with class "fomc-meeting" is present.
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'fomc-meeting')))
except Exception:
    # Do not leave an orphaned browser process behind when the calendar
    # page cannot be loaded; re-raise so the failure stays visible.
    driver.quit()
    raise

# Grab the rendered page source.
html_content = driver.page_source

# Parse the HTML.
soup = BeautifulSoup(html_content, 'html.parser')

In [2]:
# def fetch_statement_html():

In [3]:
# Walk every element with class "fomc-meeting" in the parsed calendar page,
# follow its statement / implementation-note / minutes links, and write the
# collected records to a single JSON file.
import re
from urllib.parse import urljoin

OUTPUT_PATH = "fomc_meeting.json"


def _fetch_article_text(href):
    """Open *href* in the shared browser and return the text of its <article>.

    Returns None (after printing the reason) when the page cannot be loaded
    or has no <article> element, so a single bad link never aborts the crawl.
    """
    try:
        driver.get(href)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "article")))
        article = BeautifulSoup(driver.page_source, "html.parser").find("article")
    except Exception as exc:
        # Best-effort boundary: report and continue. Unlike a bare
        # ``except:``, KeyboardInterrupt / SystemExit still propagate.
        print(f"Failed to fetch {href}: {exc}")
        return None
    if article is None:
        print(f"No <article> element found at {href}")
        return None
    return article.get_text().strip()


def _collect_links(container, wanted):
    """Pick the anchors of *container* whose text contains a wanted label.

    *wanted* maps an output key to the substring searched for in the link
    text; labels are tried in order and the first match wins. The result
    maps every key to ``{"href", "title"}`` or to ``{}`` when nothing matched.
    """
    found = {key: {} for key in wanted}
    for link in container.find_all("a"):
        href = link.get("href")
        if not href:
            # Anchors without an href cannot be followed.
            continue
        text = link.text.strip()
        for key, label in wanted.items():
            if label in text:
                # Resolve against the calendar address (`url`, set when the
                # browser was opened) so relative links can be navigated to;
                # already-absolute links pass through unchanged.
                found[key]["href"] = urljoin(url, href)
                found[key]["title"] = text
                break
    return found


def _fill_contents(entries):
    """Download the article text for every non-empty link entry, in place."""
    for entry in entries.values():
        if entry:
            entry["content"] = _fetch_article_text(entry["href"])


def _meeting_year(meeting):
    """Best-effort lookup of the four-digit year a meeting belongs to.

    NOTE(review): assumes each meeting sits inside an element with class
    "panel" whose "panel-heading" text contains the year -- confirm against
    the live page. Returns None when that structure is not found.
    """
    panel = meeting.find_parent(class_="panel")
    heading = panel.find(class_="panel-heading") if panel else None
    match = re.search(r"\d{4}", heading.get_text()) if heading else None
    return match.group(0) if match else None


# Find all elements whose class is "fomc-meeting".
meetings = soup.find_all(class_="fomc-meeting")

# Accumulates one dict per meeting.
all_data = []

for meeting in meetings:
    # Month and day(s) of the meeting.
    month = meeting.find(class_="fomc-meeting__month").strong.text.strip()
    date = meeting.find(class_="fomc-meeting__date").text.strip()

    # Statement links (HTML version and Implementation Note).
    statement_div = meeting.find(class_="col-xs-12 col-md-4 col-lg-2")
    if statement_div:
        statement_data = _collect_links(
            statement_div,
            {"html": "HTML", "implementation_note": "Implementation Note"},
        )
        _fill_contents(statement_data)
    else:
        statement_data = {}

    # Minutes link.
    minutes_div = meeting.find(class_="col-xs-12 col-md-4 col-lg-4")
    if minutes_div:
        minutes_data = _collect_links(minutes_div, {"html": "Minutes"})
        _fill_contents(minutes_data)
    else:
        minutes_data = {}

    # Assemble the record for this meeting.
    final_data = {
        "year": _meeting_year(meeting),
        "month": month,
        "date": date,
        "statement": statement_data,
        "minutes": minutes_data,
    }

    all_data.append(final_data)

# Save the data as JSON; explicit UTF-8 keeps the output independent of the
# platform's default encoding.
with open(OUTPUT_PATH, "w", encoding="utf-8") as file:
    json.dump(all_data, file, indent=4, ensure_ascii=False)

# The browser is deliberately left open: a later cell still uses `driver`.
# driver.quit()

print(f"Data saved to {OUTPUT_PATH}.")

Data saved to separate JSON files by year.


In [7]:
# Exploratory cell: look up the element with id "article" in the live browser
# and list the elements with class "panel" found inside it.
# meetings = soup.find_all(class_="fomc-meeting")

articles = driver.find_element(By.ID, "article")
panels = articles.find_elements(By.CLASS_NAME, "panel")
# Bare expression so the notebook displays the list of WebElements.
panels

# Draft of a per-panel extraction, kept commented out.
# NOTE(review): extract_meeting_infos is not defined in the visible part of
# this notebook -- confirm where it is supposed to come from.
# meeting_infos = []
# for panel in panels:
#     header = panel.find_element(By.CLASS_NAME, "panel-heading").text
#     print(f"Year: {header}\n" + "-" * 100)
#     # Get the meeting elements
#     meetings = panel.find_elements(By.CLASS_NAME, "fomc-meeting")
#     # Extract the meeting info
#     info = extract_meeting_infos(meetings)
#     meeting_infos.append({"Year": header, "Info": info})

# meeting_infos


[<selenium.webdriver.remote.webelement.WebElement (session="16c28a8880cd44364efa35ace81cd0a0", element="f.76C63AFA6F8289D2C89D22AF3A53E32E.d.C7C5664ECE4B61BFDEBD83976DF2C9DE.e.31")>,
 <selenium.webdriver.remote.webelement.WebElement (session="16c28a8880cd44364efa35ace81cd0a0", element="f.76C63AFA6F8289D2C89D22AF3A53E32E.d.C7C5664ECE4B61BFDEBD83976DF2C9DE.e.32")>,
 <selenium.webdriver.remote.webelement.WebElement (session="16c28a8880cd44364efa35ace81cd0a0", element="f.76C63AFA6F8289D2C89D22AF3A53E32E.d.C7C5664ECE4B61BFDEBD83976DF2C9DE.e.33")>,
 <selenium.webdriver.remote.webelement.WebElement (session="16c28a8880cd44364efa35ace81cd0a0", element="f.76C63AFA6F8289D2C89D22AF3A53E32E.d.C7C5664ECE4B61BFDEBD83976DF2C9DE.e.34")>,
 <selenium.webdriver.remote.webelement.WebElement (session="16c28a8880cd44364efa35ace81cd0a0", element="f.76C63AFA6F8289D2C89D22AF3A53E32E.d.C7C5664ECE4B61BFDEBD83976DF2C9DE.e.35")>,
 <selenium.webdriver.remote.webelement.WebElement (session="16c28a8880cd44364efa35ace