<!-- # 纽约联储主席讲话数据爬取

@author : zhangwubin / 01208663

@date: Oct. 21, 2024 -->

# 纽约联储 数据爬取

@author: zhangwubin

@date: 2024.11.22

In [1]:
import os
import sys

sys.path.append('..')
sys.path.append("../../")

import json
import pandas as pd
from datetime import datetime
from typing import List, Optional
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException


from utils.file_saver import json_update, json_load
from utils.common import parse_datestring

In [15]:
# Date range for scraping: from 15 October 2024 up to the moment this cell is
# executed (datetime.now() yields a naive, local-time timestamp).
start_date = datetime(year=2024, month=10, day=15)
end_date = datetime.now()

In [21]:
def get_most_recent_speech_date():
    """Return the date of the newest speech already stored on disk.

    Reads ``newyork_fed_speeches_all.json`` and parses the ``date`` field of
    every record with ``parse_datestring``. The latest parsed date is
    returned, so the result does not depend on the order of the records in
    the file.

    Returns:
        The most recent parsed speech date, or the module-level ``start_date``
        when the file is missing, holds no records, or none of its records
        carries a parseable date.
    """
    all_speeches_file = '../data/fed_speeches/newyork_fed_speeches/newyork_fed_speeches_all.json'

    if not os.path.exists(all_speeches_file):
        return start_date

    with open(all_speeches_file, 'r', encoding='utf-8') as f:
        speeches = json.load(f)

    parsed_dates = []
    for entry in speeches or []:
        # Skip anything that is not a speech record with a date string
        # (e.g. the keys of a dict-shaped file).
        if not isinstance(entry, dict) or not entry.get('date'):
            continue
        # NOTE(review): parse_datestring is assumed to return None for an
        # unparseable string, as scrape_speeches also expects - confirm.
        parsed = parse_datestring(entry['date'])
        if parsed is not None:
            parsed_dates.append(parsed)

    # Never hand None back to the caller: it compares this value with `>=`.
    return max(parsed_dates) if parsed_dates else start_date

In [17]:
# Start the Chrome browser session used for scraping.
# NOTE(review): no driver path is passed, so this relies on Selenium locating
# or downloading a matching ChromeDriver itself (Selenium Manager, 4.6+) -
# confirm the installed Selenium version.
driver = webdriver.Chrome()

In [18]:
# Open the New York Fed speeches index page and print the page title as a
# quick check of what was loaded.
URL = "https://www.newyorkfed.org/newsevents/speeches/index"
driver.get(URL)
print(driver.title)

$name - FEDERAL RESERVE BANK of NEW YORK


## 搜集所有的讲话数据信息

In [22]:
def get_speech_infos(last_names: Optional[List[str]] = None):
    """Collect the URL, date and title of the speeches listed on the index page.

    Loads ``URL`` with the module-level ``driver``, waits up to 10 seconds for
    the element with class ``newsTable`` and walks its ``<tr>`` rows.

    Args:
        last_names: Optional speaker last names to keep. A row is kept when
            the text before the first ``:`` of its link title is in this
            list. ``None`` or an empty list keeps every row.

    Returns:
        A list of ``{"url", "date", "title"}`` dicts in page order. If an
        error interrupts the collection, the error is printed and the entries
        gathered so far are returned.
    """
    speech_urls = []
    try:
        driver.get(URL)
        # Wait for the table to be present
        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "newsTable"))
        )

        for row in table.find_elements(By.TAG_NAME, "tr"):
            # get_attribute() returns None when the row has no class
            # attribute; without the fallback the `in` test raises TypeError
            # and the outer handler would abort the whole collection.
            if "yrHead" in (row.get_attribute("class") or ""):
                continue

            try:
                columns = row.find_elements(By.TAG_NAME, "td")
                if len(columns) < 2:
                    continue
                date_div = columns[0].find_element(By.TAG_NAME, "div")
                # Keep only the text before any "==" marker in the date cell
                date = date_div.text.strip().split("==")[0].strip()
                link_elem = row.find_element(By.TAG_NAME, "a")
                href = link_elem.get_attribute("href")
                title = link_elem.text.strip()
            except NoSuchElementException:
                print(f"No speech link found in row: {row.text}")
                continue

            # Titles look like "<last name>: <speech title>"; apply the
            # optional speaker filter on the part before the first colon.
            if last_names:
                speaker_last_name = title.split(':')[0].strip()
                if speaker_last_name not in last_names:
                    continue
            speech_urls.append({"url": href, "date": date, "title": title})

        print(f"Collected {len(speech_urls)} speech links.")
        return speech_urls

    except Exception as e:
        # Best-effort boundary: report the problem and return what was collected.
        print(f"An error occurred while collecting speech links: {str(e)}")
        return speech_urls

In [23]:
# Fetch the latest speech information: first the newest speech date already
# stored on disk, then the entries listed on the index page.
# No last-name filter is passed, so speeches from every speaker are collected.
# Example of a filtered call:
#   get_speech_infos(last_names=["Williams", "Dudley", "Geithner", "Stewart"])
most_recent_date = get_most_recent_speech_date()
print(f"most_recent_date: {most_recent_date}")
speech_infos = get_speech_infos()
speech_infos

most_recent_date: 2024-10-15 00:00:00
No speech link found in row: ARCHIVE
Collected 637 speech links.


[{'url': 'https://www.newyorkfed.org/newsevents/speeches/2024/nea241119',
  'date': 'Nov 19, 2024',
  'title': 'Neal: Foreign Exchange Market Structure: The Land of a Thousand Lakes'},
 {'url': 'https://www.newyorkfed.org/newsevents/speeches/2024/wil241115',
  'date': 'Nov 15, 2024',
  'title': 'Williams: 100 Years at 33 Liberty Street'},
 {'url': 'https://www.newyorkfed.org/newsevents/speeches/2024/wil241114',
  'date': 'Nov 14, 2024',
  'title': 'Williams: X Marks the Spot: Making Missing Markets'},
 {'url': 'https://www.newyorkfed.org/newsevents/speeches/2024/ath241112',
  'date': 'Nov 13, 2024',
  'title': 'Athreya: Disinflation ... and Whose Inflation?'},
 {'url': 'https://www.newyorkfed.org/newsevents/speeches/2024/per241112',
  'date': 'Nov 12, 2024',
  'title': 'Perli: Facing Quarter-End Pressures: Understanding the Repo Market and Federal Reserve Tools'},
 {'url': 'https://www.newyorkfed.org/newsevents/speeches/2024/nea241015',
  'date': 'Oct 15, 2024',
  'title': 'Neal: Centr

In [24]:
def extract_speech_content(url):
    """Open one speech page and extract its title, date, speaker and body text.

    Args:
        url: Address of the speech page, loaded with the module-level ``driver``.

    Returns:
        A dict with the keys ``title``, ``date``, ``speaker``, ``url`` and
        ``content``, or ``None`` when the page could not be loaded or parsed
        (the error is printed). ``date`` and ``speaker`` are ``'Unknown'``
        when the corresponding contact-info block is missing.
    """
    try:
        driver.get(url)

        # Wait for the page container before querying any element on it
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "container_12"))
        )
        title = driver.find_element(By.CLASS_NAME, "ts-article-title").text.strip()

        contact_info_elements = driver.find_elements(
            By.XPATH, "//body/div/div/div[@class='ts-contact-info']"
        )

        date = 'Unknown'
        speaker = 'Unknown'
        if contact_info_elements:
            # First block: prefer the line containing "Posted", else its first line
            date_lines = contact_info_elements[0].text.strip().split('\n')
            posted_date = [line for line in date_lines if 'Posted' in line]
            date = posted_date[0] if posted_date else date_lines[0]
        if len(contact_info_elements) >= 2:
            # Second block reads "<speaker>, <role>"; it is only touched when
            # it exists, so a page with a single block no longer fails with
            # IndexError and loses the whole speech.
            speaker = contact_info_elements[1].text.split(',')[0].strip() or 'Unknown'

        content_elem = driver.find_element(By.CLASS_NAME, "ts-article-text")
        # Read each paragraph's text once; every .text access is a browser call
        paragraph_texts = [p.text for p in content_elem.find_elements(By.TAG_NAME, "p")]
        content = "\n\n".join(text for text in paragraph_texts if text.strip())

        return {
            "title": title,
            "date": date,
            "speaker": speaker,
            "url": url,
            "content": content.strip()
        }
    except TimeoutException as e:
        print(f"Timeout error extracting content from {url}: {str(e)}")
    except WebDriverException as e:
        print(f"WebDriver error extracting content from {url}: {str(e)}")
    except Exception as e:
        print(f"Unexpected error extracting content from {url}: {str(e)}")
    # Reached only after a handled error; callers test the result for truthiness.
    return None

# Sample speech page (from the 2006 section of the site) for trying
# extract_speech_content by hand; the call stays commented out so it does not
# run together with the rest of the cell.
test_url = "https://www.newyorkfed.org/newsevents/speeches/2006/gei060405"
# extract_speech_content(test_url)

In [None]:
def save_speeches(speeches, year=None):
    """Store speeches in the New York Fed data folder through ``json_update``.

    The target folder is created when it does not exist yet.

    Args:
        speeches: Speech records to store; handed to ``json_update`` as-is.
        year: When given, the records go to ``newyork_fed_speeches_{year}.json``;
            when ``None`` they go to ``newyork_fed_speeches_all.json``.
    """
    folder_name = "../data/fed_speeches/newyork_fed_speeches"
    os.makedirs(folder_name, exist_ok=True)

    per_year = year is not None
    filename = (
        f"newyork_fed_speeches_{year}.json"
        if per_year
        else "newyork_fed_speeches_all.json"
    )
    file_path = os.path.join(folder_name, filename)

    json_update(file_path, speeches)

    if per_year:
        print(f"Saved {len(speeches)} speeches for {year} to {file_path}")
    else:
        print(f"Saved {len(speeches)} speeches to {file_path}")

In [None]:
# Main scraping process
def scrape_speeches(start_date: datetime, speech_infos: list):
    """Download every listed speech dated on or after ``start_date``.

    Iteration stops at the first entry older than ``start_date``, so
    ``speech_infos`` is expected to be ordered newest first. Each time the
    year of the scraped speeches changes, the speeches of the year just
    finished are saved; the last year and the complete list are saved at the
    end.

    Args:
        start_date: Inclusive lower bound for the speech date.
        speech_infos: Dicts with at least the keys ``url`` and ``date``.

    Returns:
        A tuple ``(speeches, speeches_by_year)``: the list of all scraped
        speech dicts and the same dicts grouped by year.
    """
    speeches = []
    speeches_by_year = {}
    current_year = None

    for speech in speech_infos:
        speech_date = parse_datestring(speech['date'])

        if speech_date is None:
            # Report the offending entry itself; indexing the whole list with
            # 'url' raised TypeError here instead of skipping the entry.
            print(f"Skipping speech due to invalid date: {speech['url']}")
            continue

        if speech_date < start_date:
            print("Reached speeches older than our start date, stop here")
            break

        full_speech_data = extract_speech_content(speech['url'])
        if not full_speech_data:
            # Extraction failed and was already reported; move on.
            continue

        full_speech_data['date'] = speech['date']  # Use the date from the index page
        speeches.append(full_speech_data)

        year = speech_date.year
        if year != current_year:
            # Year boundary: persist the year that just finished
            if current_year:
                save_speeches(speeches_by_year[current_year], current_year)
            current_year = year
            speeches_by_year[year] = []
        speeches_by_year[year].append(full_speech_data)
        print(f"Scraped speech: {speech['date']} - {full_speech_data['title']}")

    # Save the last year's speeches if any are left
    if current_year and speeches_by_year[current_year]:
        save_speeches(speeches_by_year[current_year], current_year)

    # Save all speeches
    save_speeches(speeches)

    return speeches, speeches_by_year

# Run the scrape for every indexed speech dated on or after most_recent_date
# (the bound is inclusive, so a speech on that exact date is fetched again).
speeches, speeches_by_year = scrape_speeches(most_recent_date, speech_infos)

Scraped speech: Nov 19, 2024 - Foreign Exchange Market Structure: The Land of a Thousand Lakes
Scraped speech: Nov 15, 2024 - 100 Years at 33 Liberty Street
Scraped speech: Nov 14, 2024 - X Marks the Spot: Making Missing Markets
Scraped speech: Nov 13, 2024 - Disinflation ... and Whose Inflation?
Scraped speech: Nov 12, 2024 - Facing Quarter-End Pressures: Understanding the Repo Market and Federal Reserve Tools


2024-11-22 16:06:14,715 - speech_scraper - ERROR - JSON file was not existed. New file newyork_fed_speeches\newyork_fed_speeches_2024.json created.
2024-11-22 16:06:14,719 - speech_scraper - ERROR - JSON file was not existed. New file newyork_fed_speeches\newyork_fed_speeches_all.json created.


Scraped speech: Oct 15, 2024 - Central Clearing in the U.S. Treasury Market: The Why and the How
Reached speeches older than our start date, stop here
Error. file newyork_fed_speeches\newyork_fed_speeches_2024.json not found.
Saved 6 speeches for 2024 to newyork_fed_speeches\newyork_fed_speeches_2024.json
Error. file newyork_fed_speeches\newyork_fed_speeches_all.json not found.
Saved 6 speeches to newyork_fed_speeches\newyork_fed_speeches_all.json


In [27]:
# Shut down the Selenium session: quit() closes the browser windows and ends
# the driver process, so `driver` cannot be used after this cell.
driver.quit()

In [None]:
# # Convert the records-style (list) JSON into a dict keyed by year
# speeches_filename = "../data/fed_speeches/newyork_fed_speeches/newyork_fed_speeches.json"
# speeches = json_load(speeches_filename)
# speech_dict = {}
# for speech in speeches:
#     year = pd.to_datetime(speech['date']).strftime('%Y')
#     speech_dict.setdefault(year, []).append(speech)
# json_update("../data/fed_speeches/newyork_fed_speeches/newyork_fed_speeches.json", speech_dict)

Error. file ../data/fed_speeches/newyork_fed_speeches/newyork_fed_speeches.json not found.


2024-11-22 17:20:23,864 - speech_scraper - ERROR - JSON file was not existed. New file ../data/fed_speeches/newyork_fed_speeches/newyork_fed_speeches.json created.


In [None]:
# for year in range(2017, 2025):
#     single_year_speeches = json_load(
#         f"../data/fed_speeches/newyork_fed_speeches/newyork_speeches_{year}.json"
#     )
#     json_update(
#         "../data/fed_speeches/newyork_fed_speeches/newyork_fed_speeches.json",
#         {f"{year}": single_year_speeches},
#     )