# FOMC议息决议历史资料爬取

@author: WUBIN ZHANG

@date: Oct 15, 2024

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import os, sys
import json
import re

sys.path.append('../')
from utils.file_saver import json_update

# Set up the WebDriver.
options = webdriver.ChromeOptions()
# Hand the options object to the driver so that any flag added to it
# actually takes effect; previously it was created and then ignored.
driver = webdriver.Chrome(options=options)

# Open the index page that links to each year's historical FOMC materials.
url = "https://www.federalreserve.gov/monetarypolicy/fomc_historical_year.htm"
driver.get(url)


def fetch_meeting_content(driver, statement_url: str):
    """Load a statement page and return the text of its paragraphs.

    Args:
        driver: Selenium WebDriver used for navigation and element lookup.
        statement_url: URL of the statement page to open.

    Returns:
        The text of every matched ``<p>`` element joined by blank lines, or
        ``""`` if any step raises (the error is printed, not re-raised).

    Note:
        On success the browser is moved one entry back in its history
        before returning; on failure no back-navigation is attempted.
    """
    paragraph_xpath = (
        "//div[@id='article']"
        "/div[@class='col-xs-12 col-sm-8 col-md-8']/p"
    )
    try:
        driver.get(statement_url)
        paragraphs = driver.find_elements(By.XPATH, paragraph_xpath)
        text = "\n\n".join(p.text for p in paragraphs)
        driver.back()
    except Exception as e:
        print(f"Error processing {statement_url}: {e}")
        return ""
    return text

# XPath of the per-year anchor elements on the index page.
YEAR_LINKS_XPATH = (
    "//div[@id='article']/div/div[@class='panel panel-default panel-padded']"
    "/ul/li/a[@href]"
)
# Locate the links that lead to each year's page.
years_links = driver.find_elements(By.XPATH, YEAR_LINKS_XPATH)

# Collected results, keyed by year text; filled in by the crawl loop.
all_data = dict()

def safe_get_text(webelement):
    """Return ``webelement.text`` stripped of surrounding whitespace.

    Any exception raised while reading or stripping the text is printed
    and an empty string is returned instead.
    """
    try:
        raw_text = webelement.text
        cleaned = raw_text.strip()
    except Exception as e:
        print(f"Error {repr(e)} when get text.")
        return ""
    else:
        return cleaned

# Inclusive range of years whose pages are crawled.
FIRST_YEAR, LAST_YEAR = 2014, 2019

# Map year text -> URL of that year's page, for the years in range.
year_urls = {}
for link in years_links:
    year_text = safe_get_text(link)
    # Skip links whose label is not a plain number.
    if not year_text.isdigit():
        continue
    # Skip years outside the requested range.
    if not FIRST_YEAR <= int(year_text) <= LAST_YEAR:
        continue
    year_urls[year_text] = link.get_attribute('href')


# Crawl each selected year page and collect the statement text of every
# meeting listed on it. The driver is always shut down, even if the crawl
# aborts with an error, so no browser process is left behind.
try:
    for year_text, year_url in year_urls.items():
        driver.get(year_url)
        # Each meeting's materials are grouped in their own panel.
        meeting_panels = driver.find_elements(
            By.XPATH,
            "//div[@id='article']/div[contains(@class, 'panel panel-default')]",
        )
        print(f"{year_text}: {len(meeting_panels)} meetings was found.")

        # Phase 1: read everything needed from the year page (meeting dates
        # and statement URLs) BEFORE navigating anywhere else. Element
        # references are not guaranteed to stay valid once the browser
        # leaves the page, so no panel is touched after the first fetch.
        pending = []  # (date, statement_url or None), in page order
        for panel in meeting_panels:
            # Reset per panel so a failure never reuses the previous
            # panel's date (or hits an unbound name on the first panel).
            date = None
            try:
                # 1) Extract the meeting date from the panel title.
                panel_title = panel.find_element(By.XPATH, './/h5').text
                date = panel_title.replace(' Meeting - ', ', ')
                print(f"{date} was found.")
                # 2) Collect the links labelled "Statement" in this panel.
                url_elements = panel.find_elements(
                    By.XPATH,
                    ".//div/div/p/a[@href and text()='Statement']",
                )
                for ele in url_elements:
                    if ele.text != 'Statement':
                        continue
                    pending.append((date, ele.get_attribute('href')))
            except Exception as e:
                print(repr(e))
                # Keep an empty placeholder for the meeting, but only when
                # its date is actually known.
                if date is not None:
                    pending.append((date, None))

        # Phase 2: visit each statement page and store its text.
        year_data = []
        for date, statement_url in pending:
            if statement_url:
                content = fetch_meeting_content(driver, statement_url)
            else:
                content = ""
            year_data.append({"date": date, "content": content})

        print(f"{year_text}: {len(year_data)} was collected.")
        print(f"{year_text} finished.\n" + '-'*60 + '\n')
        all_data[year_text] = year_data

    print("All finished.")
finally:
    # Shut down the WebDriver.
    driver.quit()

# 保存数据
# with open("fomc_statements.json", "w") as f:
#     json.dump(all_data, f, ensure_ascii=False, indent=4)
# json_update("fomc_statements.json", all_data, tag_fields='date')

2018: 8 meetings was found.
January 30-31, 2018 was found.
March 20-21, 2018 was found.
May 1-2, 2018 was found.
June 12-13, 2018 was found.
Jul/Aug 31-1, 2018 was found.
September 25-26, 2018 was found.
November 7-8, 2018 was found.
December 18-19, 2018 was found.
2018: 8 was collected.
2018 finished.
------------------------------------------------------------

2017: 8 meetings was found.
Jan/Feb 31-1, 2017 was found.
March 14-15, 2017 was found.
May 2-3, 2017 was found.
June 13-14, 2017 was found.
July 25-26, 2017 was found.
September 19-20, 2017 was found.
Oct/Nov 31-1, 2017 was found.
December 12-13, 2017 was found.
2017: 8 was collected.
2017 finished.
------------------------------------------------------------

2016: 8 meetings was found.
January 26-27, 2016 was found.
March 15-16, 2016 was found.
April 26-27, 2016 was found.
June 14-15, 2016 was found.
July 26-27, 2016 was found.
September 20-21, 2016 was found.
November 1-2, 2016 was found.
December 13-14, 2016 was found.
201

In [5]:
# Persist the collected data through the project helper `json_update`
# rather than overwriting the file with a plain `json.dump`.
# `tag_fields` is forwarded as a keyword argument; its exact meaning is
# defined in utils.file_saver.
json_update("fomc_statements.json", all_data, tag_fields="date")

In [3]:
        # # 获取当年所有的决议链接
        # statements = driver.find_elements(
        #     By.XPATH, "//*[@id='article']/div/div/div/p/a[@href]"
        # )
        
        # year_data = []
        
        # for statement in statements:
        #     statement_text = statement.text.strip()
        #     if statement_text != "Statement":
        #         continue
        #     statement_url = statement.get_attribute('href')
        #     try:
        #         driver.get(statement_url)
                    
        #         # 提取决议文本
        #         content = driver.find_element(By.XPATH, "//div[@class='col-md-9']/p").text
                
        #         # 添加到年份数据中
        #         year_data.append({"date": statement_text, "content": content})
        #     except Exception as e:
        #         print(f"Error processing {statement_url}: {e}")