# Board of Governors演讲数据爬取

@author: Mandy Lau, Wubin Zhang

@date: Sep 25, 2024

In [17]:
import sys
sys.path.append("..")

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
import time
import json
import os
import re
from datetime import datetime, timedelta

from utils.file_saver import json_update, json_dump, json_load

In [18]:
# Start a Chrome browser session. No driver path is passed, so Selenium is
# relied on to locate/download a matching ChromeDriver by itself.
driver = webdriver.Chrome()

In [19]:
# Navigate to the Federal Reserve speeches listing page, reusing the `driver`
# session created in the previous cell, and print the page title as a quick
# check that the page loaded.
driver.get("https://www.federalreserve.gov/newsevents/speeches.htm")
print(driver.title)

Federal Reserve Board - Speeches of Federal Reserve Officials


In [20]:
# Date range for scraping. Both bounds are datetime objects at midnight; they
# are typed into the page's date filter and also compared against each listed
# speech date during pagination.
start_date = datetime(2006, 1, 1)
# Alternative bounds (e.g. for a run covering only recent speeches):
# start_date = datetime(2024, 1, 1)
# end_date = datetime.now()
end_date = datetime(2012, 12, 31)

In [21]:
# Function to get the most recent speech date from existing files
def get_most_recent_speech_date(start_date, all_speeches_file: str, region):
    """Return the date of the newest speech already archived on disk.

    Args:
        start_date (datetime): Fallback value, returned unchanged when there is
            no archive file or the archive holds no speeches.
        all_speeches_file (str): Path to the JSON archive, a list of speech
            dicts that each carry a 'date' string.
        region (str): Archive flavour. 'bog' dates are parsed as '%B %d, %Y'
            (full month name); any other value uses '%b %d, %Y'
            (abbreviated month name).

    Returns:
        datetime: The latest speech date found in the archive, or `start_date`.

    Raises:
        json.JSONDecodeError: If the file has content that is not valid JSON.
        ValueError: If a stored date does not match the region's format.
    """
    if not os.path.exists(all_speeches_file):
        return start_date

    with open(all_speeches_file, 'r', encoding='utf-8') as f:
        raw = f.read()

    # A zero-byte (or whitespace-only) file means "nothing archived yet";
    # json.loads would raise on it, so treat it like a missing file.
    if not raw.strip():
        return start_date

    speeches = json.loads(raw)
    if not speeches:
        return start_date

    date_format = '%B %d, %Y' if region == 'bog' else '%b %d, %Y'
    # Take the maximum over every entry instead of trusting the first one:
    # the archive is not guaranteed to be stored newest-first.
    return max(datetime.strptime(speech['date'], date_format) for speech in speeches)

In [22]:
# Decide where the scrape starts.
# `all_speeches_file` is the archive path for the incremental lookup; in the
# cells shown here it is only referenced by the disabled call below.
all_speeches_file = '../data/fed_speeches/bog_fed_speeches/bog_fed_speeches_tdy.json'
# Incremental mode (disabled): resume from the newest archived speech.
# most_recent_date = get_most_recent_speech_date(start_date, all_speeches_file, 'bog')
# Full-range mode: start from the configured start_date.
most_recent_date = start_date
print(f"Starting scrape from: {most_recent_date.strftime('%Y-%m-%d')}")

Starting scrape from: 2006-01-01


In [23]:
# Locate the start and end date input fields and set the desired dates.
# Both inputs live in the same container of the filter form, so the shared
# part of the selector is written once.
date_inputs_css = (
    "#content > div.row.ng-scope > div > div.col-xs-12.col-sm-8.col-md-10.angularEvents.ng-scope"
    " > form > div:nth-child(2) > div"
)
# The form appears to be rendered client-side (Angular `ng-scope` classes), so
# wait for the inputs to exist instead of querying right after the page load.
form_wait = WebDriverWait(driver, 10)
start_date_elem = form_wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, f"{date_inputs_css} > div:nth-child(1) > input"))
)
end_date_elem = form_wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, f"{date_inputs_css} > div:nth-child(3) > input"))
)
# Clear existing dates
start_date_elem.clear()
end_date_elem.clear()
# Type the new range.
# NOTE(review): dates are sent as YYYY-MM-DD; confirm this is the format the
# page's date inputs accept.
start_date_elem.send_keys(most_recent_date.strftime('%Y-%m-%d'))
end_date_elem.send_keys(end_date.strftime('%Y-%m-%d'))
print("Date range set: {} to {}".format(most_recent_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')))

Date range set: 2006-01-01 to 2012-12-31


In [24]:
# Collect the speaker checkbox <input> elements with a single XPath query.
# NOTE(review): the result is not used by any later cell shown here, and the
# variable name is misspelled ("chechboxes").
# NOTE(review): `//content/...` selects elements whose *tag name* is `content`,
# not the element with id="content" that the CSS selectors in this notebook
# target, so this query likely returns an empty list; confirm.
speaker_chechboxes = driver.find_elements(
    by=By.XPATH, value="//content/div/div/div/form/div/div/label/input"
)

In [25]:
# Locate the checkboxes for the desired speakers.
# All eight checkboxes share one selector and differ only in their position
# inside the speaker filter group, so the selector is written once as a
# template and filled in per speaker.
speaker_checkbox_css = (
    "#content > div.row.ng-scope > div > div.col-xs-12.col-sm-8.col-md-10.angularEvents.ng-scope"
    " > form > div:nth-child(4) > div:nth-child({position}) > label > input"
)
# Order matters: the n-th name maps to the n-th checkbox on the page.
speaker_names = ("powell", "jefferson", "barr", "bowman", "cook", "kugler", "waller", "former")
speaker_inputs = [
    driver.find_element(By.CSS_SELECTOR, speaker_checkbox_css.format(position=position))
    for position in range(1, len(speaker_names) + 1)
]
# Keep the per-speaker variables available under their original names.
powell, jefferson, barr, bowman, cook, kugler, waller, former = speaker_inputs

# Click the checkboxes for the desired speakers (all of them, in page order).
for speaker_input in speaker_inputs:
    speaker_input.click()

In [26]:
# Click the search control to apply the date and speaker filters.
# NOTE(review): this absolute XPath depends on the exact page layout and ends
# at `div[5]` (a div inside the form) rather than at a button element; confirm
# it still resolves to the clickable search control.
search_button = driver.find_element(By.XPATH, "/html/body/div[3]/div[2]/div/div[1]/form/div[5]")
search_button.click()
# Fixed pause intended to let the filtered result list render; no explicit
# wait condition is checked here.
time.sleep(1.2)

In [28]:
def get_speech_links_from_current_page():
    """Collect the speech links listed on the page the browser currently shows.

    Reads the module-level `driver`. Rows in which the link or the date
    element cannot be found are reported and skipped. Any other error while
    reading the page is reported and results in an empty list.

    Returns:
        (list): One dict per speech row with the keys 'href', 'title' and
            'date' (the date is the text of the row's <time> element).
    """
    collected = []
    try:
        listing_rows = driver.find_elements(By.CSS_SELECTOR, "div.row.ng-scope[ng-repeat^='item in items']")
        for listing_row in listing_rows:
            try:
                # Link to the speech transcript
                anchor = listing_row.find_element(By.CSS_SELECTOR, "p.itemTitle em a")
                entry = {
                    'href': anchor.get_attribute('href'),
                    'title': anchor.text,
                    'date': listing_row.find_element(By.CSS_SELECTOR, "time").text,
                }
            except NoSuchElementException:
                print(f"Could not find link or date in a row. Skipping.")
                continue
            collected.append(entry)
    except Exception as e:
        print(f"Error extracting speech URLs from the current page: {e}")
        return []
    return collected
    
# speech_infos = get_speech_links_from_current_page()
# speech_infos

In [None]:
# Walk the paginated result list and collect every speech whose date lies in
# (most_recent_date, end_date].
# The stop rule assumes the site lists speeches newest-first: once a speech at
# or before `most_recent_date` shows up, everything after it is older too.
all_speeches = []
reached_older_speeches = False
while not reached_older_speeches:
    # Get links from the current page
    page_speeches = get_speech_links_from_current_page()
    if not page_speeches:
        print("No more speeches found or error occurred. Stopping pagination.")
        break

    for speech in page_speeches:
        speech_date = datetime.strptime(speech['date'], '%m/%d/%Y')
        if speech_date > end_date:
            # Newer than the requested window: skip it but keep going, because
            # older speeches inside the window may still follow.
            continue
        if speech_date <= most_recent_date:
            # Already collected in an earlier run (or before the window).
            print("Reached speeches older than the most recent date. Stopping pagination.")
            reached_older_speeches = True
            break
        all_speeches.append(speech)

    if reached_older_speeches:
        break

    # Try to find and click the "Next" button
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, "li.pagination-next:not(.disabled) a")
    except NoSuchElementException:
        print("Next button not found or disabled. Reached last page.")
        break
    driver.execute_script("arguments[0].click();", next_button)
    # Give the next page time to render before reading it; same pause as the
    # one used after clicking the search control.
    time.sleep(1.2)

print(f"Total new speeches found: {len(all_speeches)}")

Reached speeches older than the most recent date. Stopping pagination.
Total new speeches found: 0


: 

In [13]:
def extract_speech_content(url):
    """Open a speech page and scrape its date, title, speaker and body text.

    Uses the module-level `driver`. Waits up to 10 seconds for the element
    with id "content" to be present before reading the page.

    Args:
        url (str): Address of the speech page to load.

    Returns:
        dict | None: A dict with the keys 'title', 'date', 'speaker', 'url'
            and 'content' (paragraph texts joined by blank lines), or None
            when a timeout, WebDriver error or any other exception occurred
            (the error is printed).
    """
    try:
        driver.get(url)
        # Wait for the content to load
        page_ready = EC.presence_of_element_located((By.ID, "content"))
        WebDriverWait(driver, 10).until(page_ready)

        # Heading fields: date, title, speaker
        speech_date = driver.find_element(By.CSS_SELECTOR, "#article > div.heading.col-xs-12.col-sm-8.col-md-8 > p.article__time").text
        speech_title = driver.find_element(By.CSS_SELECTOR, "#article > div.heading.col-xs-12.col-sm-8.col-md-8 > h3").text
        speaker_name = driver.find_element(By.CSS_SELECTOR, "#article > div.heading.col-xs-12.col-sm-8.col-md-8 > p.speaker").text

        # Body: every matching paragraph, separated by a blank line
        paragraphs = driver.find_elements(By.CSS_SELECTOR, "#article .col-xs-12.col-sm-8.col-md-8 > p")
        body_text = "\n\n".join(paragraph.text for paragraph in paragraphs)
    except TimeoutException as exc:
        print(f"Timeout error extracting content from {url}: {str(exc)}")
        return None
    except WebDriverException as exc:
        print(f"WebDriver error extracting content from {url}: {str(exc)}")
        return None
    except Exception as exc:
        print(f"Unexpected error extracting content from {url}: {str(exc)}")
        return None

    return {
        'title': speech_title,
        'date': speech_date,
        'speaker': speaker_name,
        'url': url,
        'content': body_text,
    }

In [14]:
def sanitize_filename(filename):
    """Turn arbitrary text into a string usable as a file name.

    Every character that is not a word character, hyphen, underscore, dot or
    space is dropped; the remaining spaces are then turned into underscores.

    Args:
        filename (str): Text to clean.

    Returns:
        str: The cleaned text.
    """
    without_invalid_chars = re.sub(r'[^\w\-_\. ]', '', filename)
    return without_invalid_chars.replace(' ', '_')

# Ensure the directory the JSON files are written to exists.
output_dir = "../data/fed_speeches/bog_fed_speeches"
os.makedirs(output_dir, exist_ok=True)


def _save_year_batch(year, speeches):
    """Pass one year's batch of speeches to json_update for that year's file."""
    json_update(
        filepath=f"{output_dir}/bog_fed_speeches_{year}.json",
        obj=speeches,
    )
    print(f"Saved speeches for year {year} to bog_fed_speeches_{year}.json")


speeches_by_year = {}        # year string -> every speech scraped for that year
current_year = None          # year of the batch currently being accumulated
current_year_batch = []      # speeches of current_year not yet written to disk
speeches_with_content = []
for speech in all_speeches:
    print(f"Extracting content from: {speech['title']}")
    # Fetch the full text of the speech; None means extraction failed.
    speech_content = extract_speech_content(speech['href'])
    if speech_content:
        speeches_with_content.append(speech_content)

        # The year decides which per-year file the speech belongs to.
        year_str = datetime.strptime(speech_content['date'], '%B %d, %Y').strftime('%Y')

        if year_str != current_year:
            # Moving on to a different year: write out what was gathered for
            # the previous one so finished years reach disk early. If a year
            # shows up again later, its new speeches form a fresh batch.
            # NOTE(review): this relies on json_update merging into an
            # existing file rather than replacing it; confirm.
            if current_year_batch:
                _save_year_batch(current_year, current_year_batch)
            current_year = year_str
            current_year_batch = []
        current_year_batch.append(speech_content)
        speeches_by_year.setdefault(year_str, []).append(speech_content)

    time.sleep(2)  # Pause between requests to avoid hammering the site

# Sort speeches by date (oldest first)
speeches_with_content.sort(key=lambda x: datetime.strptime(x['date'], '%B %d, %Y'))

# Save the batch of the last year that was being accumulated
if current_year_batch:
    _save_year_batch(current_year, current_year_batch)

print(f"Extracted content from {len(speeches_with_content)} speeches")

# Save all speeches to a single file as well
json_update(
    filepath="../data/fed_speeches/bog_fed_speeches/bog_fed_speeches.json",
    obj=speeches_with_content
)
print("Saved all speeches to bog_fed_speeches.json")

Extracted content from 0 speeches
Saved all speeches to bog_fed_speeches.json


In [15]:
# Close the browser and end the WebDriver session started at the top of the
# notebook; `driver` cannot be used for further page loads after this.
driver.quit()