In [45]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_speech_text(url, timeout=30):
    """Download a speech page and return the text of its main content column.

    Args:
        url: Absolute http(s) URL of the speech page. Any other value (for
            example the 'N/A' placeholder stored for listing entries that
            have no link, or a missing value) is treated as "no page".
        timeout: Seconds to wait for the server before giving up, so a
            single stalled request cannot hang the whole scrape.

    Returns:
        The stripped text of every non-empty <p> inside the
        'col-xs-12 col-sm-8 col-md-8' div, joined by single spaces.
        An empty string is returned when the URL is not usable, the request
        fails, the status is not 200, or the div is absent.
    """
    # Placeholders and missing values are not fetchable; passing them to
    # requests would raise (e.g. MissingSchema) and abort a DataFrame.apply.
    if not isinstance(url, str) or not url.startswith(('http://', 'https://')):
        return ""

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Best-effort scrape: report the failure and carry on with other URLs.
        print(f"Failed to retrieve {url}: {exc}")
        return ""

    if response.status_code != 200:
        print(f"Failed to retrieve {url}")
        return ""

    speech_soup = BeautifulSoup(response.content, 'html.parser')

    # The speech body lives in the wide content column of the page layout.
    content_div = speech_soup.find('div', class_='col-xs-12 col-sm-8 col-md-8')
    if content_div is None:
        return ""

    # Strip each paragraph once and drop the ones that end up empty.
    paragraphs = (p.get_text(strip=True) for p in content_div.find_all('p'))
    return ' '.join(text for text in paragraphs if text)

# Scrape the yearly speech listing pages of federalreserve.gov into one
# DataFrame (one row per speech), then download the full text of each speech.

BASE_URL = 'https://www.federalreserve.gov'
SPEECH_COLUMNS = ['time', 'title', 'news_speaker', 'location', 'url']
LISTING_TIMEOUT = 30  # seconds to wait for a listing page before giving up


def _text_or_na(tag):
    """Return the stripped text of *tag*, or 'N/A' when the tag is missing."""
    return tag.text.strip() if tag is not None else 'N/A'


def _parse_speech_listing(soup):
    """Return one dict (keys: SPEECH_COLUMNS) per speech on a listing page."""
    # Each speech is rendered as a time div followed by a detail div; the two
    # result lists are paired up positionally.
    event_times = soup.find_all('div', class_='col-xs-3 col-md-2 eventlist__time')
    event_details = soup.find_all('div', class_='col-xs-9 col-md-10 eventlist__event')

    rows = []
    for time_div, detail_div in zip(event_times, event_details):
        title_tag = detail_div.find('a')
        speaker_tag = detail_div.find('p', class_='news__speaker')

        # The location is the first <p> after the speaker; without a speaker
        # tag there is nothing to anchor on, so fall back to 'N/A' instead of
        # raising AttributeError on None.
        location_tag = (
            speaker_tag.find_next_sibling('p') if speaker_tag is not None else None
        )

        # Links on the listing page are site-relative; entries without a
        # usable link keep the 'N/A' placeholder.
        href = title_tag.get('href') if title_tag is not None else None
        speech_url = f"{BASE_URL}{href}" if href else 'N/A'

        rows.append({
            'time': _text_or_na(time_div.find('time')),
            'title': _text_or_na(title_tag),
            'news_speaker': _text_or_na(speaker_tag),
            'location': _text_or_na(location_tag),
            'url': speech_url,
        })
    return rows


def _fetch_speech_listing(listing_url):
    """Fetch one yearly listing page; return its rows, or [] if it fails."""
    try:
        response = requests.get(listing_url, timeout=LISTING_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable year should not abort the whole scrape.
        print(f"Failed to retrieve {listing_url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve {listing_url}")
        return []

    return _parse_speech_listing(BeautifulSoup(response.content, 'html.parser'))


# The listing pages use two URL patterns: '<year>-speeches.htm' for 2011-2024
# and '<year>speech.htm' for 2006-2010. The newer years are scraped first.
listing_urls = [
    f'{BASE_URL}/newsevents/speech/{year}-speeches.htm' for year in range(2011, 2025)
] + [
    f'{BASE_URL}/newsevents/speech/{year}speech.htm' for year in range(2006, 2011)
]

# Collect plain dicts and build the DataFrame once instead of concatenating
# onto an initially empty frame on every iteration.
speech_rows = []
for listing_url in listing_urls:
    speech_rows.extend(_fetch_speech_listing(listing_url))

# Explicit columns keep the schema stable even if nothing was scraped.
all_speeches_df = pd.DataFrame(speech_rows, columns=SPEECH_COLUMNS)

# Add the full text of each speech to the all_speeches_df DataFrame
all_speeches_df['speech_text'] = all_speeches_df['url'].apply(extract_speech_text)

all_speeches_df

Unnamed: 0,time,title,news_speaker,location,url,speech_text
0,11/29/2011,Aggregate Demand and the Global Economic Recovery,Vice Chair Janet L. Yellen,"At the Federal Reserve Bank of San Francisco, ...",https://www.federalreserve.gov/newsevents/spee...,"Good morning. I'm delighted to return ""home"" t..."
1,11/11/2011,Pursuing Financial Stability at the Federal Re...,Vice Chair Janet L. Yellen,At the Fourteenth Annual International Banking...,https://www.federalreserve.gov/newsevents/spee...,Let me begin by thanking the Federal Reserve B...
2,11/10/2011,Town Hall Meeting Remarks,Chairman Ben S. Bernanke,At the Town Hall Meeting with Soldiers and and...,https://www.federalreserve.gov/newsevents/spee...,It is an honor and a privilege for me to join ...
3,11/9/2011,Welcoming Remarks,Chairman Ben S. Bernanke,At the Conference on Small Business and Entrep...,https://www.federalreserve.gov/newsevents/spee...,Accessible Keys for Video [Space Bar]toggles p...
4,11/9/2011,The Evolution of Capital Regulation,Governor Daniel K. Tarullo,At the Clearing House Business Meeting and Con...,https://www.federalreserve.gov/newsevents/spee...,"A little over three years ago, Lehman Brothers..."
...,...,...,...,...,...,...
1068,2/3/2010,Regulation and Its Discontents,Governor Kevin Warsh,At the New York Association for Business Econo...,https://www.federalreserve.gov/newsevents/spee...,Thank you to the New York Association for Busi...
1069,1/29/2010,Focusing on Bank Interest Rate Risk Exposure,Vice Chairman Donald L. Kohn,At the Federal Deposit Insurance Corporation's...,https://www.federalreserve.gov/newsevents/spee...,I very much appreciate Chairman Bair's invitat...
1070,1/4/2010,The Economic Outlook,Governor Elizabeth A. Duke,"At the Economic Forecast Forum, Raleigh, Nort...",https://www.federalreserve.gov/newsevents/spee...,I am very pleased to be here today to discuss ...
1071,1/3/2010,Monetary Policy and the Housing Bubble,Chairman Ben S. Bernanke,At the Annual Meeting of the American Economic...,https://www.federalreserve.gov/newsevents/spee...,The financial crisis that began in August 2007...


In [46]:
# Write the scraped speeches to disk; index=False keeps the DataFrame's
# positional index out of the CSV file.
all_speeches_df.to_csv(path_or_buf='all_speeches.csv', index=False)
