In [80]:
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup as bs
from webdriver_manager.chrome import ChromeDriverManager
import time
import datetime as dt

In [81]:
# set up splinter: resolve the chromedriver binary, then open a Chrome
# session with headless mode turned off
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser("chrome", headless=False, **executable_path)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\T\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache


In [82]:
# Walk the paginated press release listing and collect one dict
# (url, title, category) per release into `press_release_list`.
page_url = "https://www.federalreserve.gov/newsevents/pressreleases.htm"
base_url = "https://www.federalreserve.gov"
max_pages = 10           # upper bound on the number of listing pages to walk
initial_load_wait = 10   # seconds to let the first page finish rendering
page_turn_wait = 1       # seconds to let the next page render after clicking 'Next'

# visit web page
browser.visit(page_url)
time.sleep(initial_load_wait)

press_release_list = []
seen_urls = set()  # guards against re-scraping a page that did not change

# iterate through all pages
for page_number in range(max_pages):

    # create html object and scrape into soup
    soup = bs(browser.html, "html.parser")
    press_releases = soup.find_all("div", "col-xs-9 col-md-10 eventlist__event")

    new_on_this_page = 0

    for release in press_releases:

        # an entry without a link cannot be followed later, so skip it
        if release.a is None or not release.a.get("href"):
            continue

        # scrape url, title, and category of press release
        release_url = base_url + release.a["href"]
        if release_url in seen_urls:
            # duplicate urls would multiply rows when merging on "url" later
            continue
        seen_urls.add(release_url)

        title = release.a.text.strip()
        category = release.b.text.strip() if release.b is not None else None

        # put variables in dictionary
        press_release_list.append({"url": release_url,
                                   "title": title,
                                   "category": category
                                  })
        new_on_this_page += 1

    # a page that yields nothing new means pagination did not advance
    if new_on_this_page == 0:
        print("scraping complete")
        break

    # no need to page forward after the last page we intend to read
    if page_number == max_pages - 1:
        break

    # click the 'Next' button on each page
    try:
        browser.links.find_by_partial_text('Next').click()
    except Exception:
        # no usable 'Next' link: stop instead of re-scraping the same page.
        # `Exception` (rather than a bare except) lets Ctrl-C interrupt the run.
        print("scraping complete")
        break

    # give the listing time to re-render before reading browser.html again
    time.sleep(page_turn_wait)

In [60]:
# sanity check: number of listing entries collected by the scraping loop
len(press_release_list)

200

In [61]:
# inspect the first collected entry (a dict with "url", "title", "category")
press_release_list[0]

{'url': 'https://www.federalreserve.gov/newsevents/pressreleases/monetary20210625a.htm',
 'title': 'Federal Reserve Board announces it will extend for a final time its Paycheck Protection Program Liquidity Facility, or PPPLF, by an additional month to July 30, 2021',
 'category': 'Monetary Policy'}

In [107]:
# Visit every press release collected in `press_release_list` and record its
# date, release time, a datetime value and the concatenated paragraph text
# in `press_release_details` (one dict per release).

# strptime's %z needs a numeric UTC offset, so map the timezone
# abbreviations used on the page to their offsets
tz_offsets = {"EST": "-0500", "EDT": "-0400"}

heading_class = "heading col-xs-12 col-sm-8 col-md-8"
content_class = "col-xs-12 col-sm-8 col-md-8"

press_release_details = []

# use urls scraped in previous cell to pull the details of each release
for release in press_release_list:
    link = release["url"]

    # visit web page
    browser.visit(link)

    # create html object and scrape into soup
    soup = bs(browser.html, "html.parser")

    # scrape date and time of press release; both sit in the heading block,
    # so look it up once
    heading = soup.find("div", class_=heading_class)
    release_date = heading.find("p", class_="article__time").text.strip()
    time_tag = heading.find("p", class_="releaseTime")
    time_text = time_tag.text.strip() if time_tag is not None else ""

    release_time = None
    parsed_datetime = None

    if time_text.endswith(tuple(tz_offsets)):
        # keep only the trailing "<h:mm> <a.m./p.m.> <TZ>" tokens and replace
        # a.m./p.m. with am/pm for strptime's %p to work
        release_time = " ".join(time_text.split()[-3:]).replace(".", "")
        tz_name = release_time[-3:]

        try:
            # concatenate date and time, swapping the abbreviation (EST/EDT)
            # for its numeric offset (-0500/-0400)
            parsed_datetime = dt.datetime.strptime(
                release_date + " " + release_time[:-3] + tz_offsets[tz_name],
                "%B %d, %Y %I:%M %p %z",
            )
        except ValueError:
            # one oddly formatted timestamp should not abort the whole run;
            # fall back to the date-only value below
            print(f"could not parse release time {time_text!r} for {link}; using date only")
            release_time = None

    if parsed_datetime is not None:
        # NOTE(review): this overwrites the minutes with 30 whatever the
        # actual release minute is (e.g. 3:00 pm -> 15:30, 9:45 am -> 09:30).
        # Kept as in the original; confirm this is the intended bucketing.
        release_datetime = parsed_datetime.replace(minute=30)
    else:
        # no usable time: date only (this value is timezone-naive)
        release_datetime = dt.datetime.strptime(release_date, "%B %d, %Y")

    # find press release content paragraphs and join them with newlines.
    # lstrip drops the separators left by leading empty paragraphs, so the
    # text starts at the first non-empty paragraph.
    paragraphs = soup.find("div", class_=content_class).find_all("p")
    text = "\n".join(p.text.strip() for p in paragraphs).lstrip("\n")

    # put variables in dictionary
    press_release_details.append({"url": link,
                                  "date": release_date,
                                  "time": release_time,
                                  "datetime_on_30_min": release_datetime,
                                  "text": text
                                 })


In [108]:
# sanity check: number of detail records collected by the detail-scraping loop
len(press_release_details)

200

In [109]:
# inspect the first detail record (url, date, time, datetime_on_30_min, text)
press_release_details[0]

{'url': 'https://www.federalreserve.gov/newsevents/pressreleases/monetary20210625a.htm',
 'date': 'June 25, 2021',
 'time': '3:30 pm EDT',
 'datetime_on_30_min': datetime.datetime(2021, 6, 25, 15, 30, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))),
 'text': "The Federal Reserve Board on Friday announced it will extend for a final time its Paycheck Protection Program Liquidity Facility, or PPPLF, by an additional month to July 30, 2021. The extension is being made as an operational accommodation to allow additional processing time for banks, community development financial institutions, and other financial institutions to pledge to the facility any Paycheck Protection Program, or PPP, loans approved by the Small Business Administration through the June 30 expiration of the PPP program.\nThe PPPLF extends term credit to financial institutions making PPP loans, accepting the PPP loans as collateral. The liquidity provided by the PPPLF bolsters the effectiveness of t

In [110]:
# tabulate the listing entries (one row per dict) and preview the first rows
df_press_releases = pd.DataFrame(data=press_release_list)

df_press_releases.head(n=5)

Unnamed: 0,url,title,category
0,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve Board announces it will extend...,Monetary Policy
1,https://www.federalreserve.gov/newsevents/pres...,Agencies release list of distressed or underse...,Banking and Consumer Regulatory Policy
2,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve Board releases results of annu...,Banking and Consumer Regulatory Policy
3,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve announces it will continue its...,Monetary Policy
4,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve Board extends comment period o...,Banking and Consumer Regulatory Policy


In [111]:
# tabulate the per-release detail records and preview the first rows
df_press_release_content = pd.DataFrame(data=press_release_details)

df_press_release_content.head(n=5)

Unnamed: 0,url,date,time,datetime_on_30_min,text
0,https://www.federalreserve.gov/newsevents/pres...,"June 25, 2021",3:30 pm EDT,2021-06-25 15:30:00-04:00,The Federal Reserve Board on Friday announced ...
1,https://www.federalreserve.gov/newsevents/pres...,"June 25, 2021",3:00 pm EDT,2021-06-25 15:30:00-04:00,The Board of Governors of the Federal Reserve ...
2,https://www.federalreserve.gov/newsevents/pres...,"June 24, 2021",4:30 pm EDT,2021-06-24 16:30:00-04:00,The Federal Reserve Board on Thursday released...
3,https://www.federalreserve.gov/newsevents/pres...,"June 23, 2021",9:45 am EDT,2021-06-23 09:30:00-04:00,The Federal Reserve on Wednesday announced tha...
4,https://www.federalreserve.gov/newsevents/pres...,"June 22, 2021",10:30 am EDT,2021-06-22 10:30:00-04:00,The Federal Reserve Board announced on Tuesday...


In [112]:
# combine listing entries with their detail records, matching rows on "url"
# (inner join: only urls present in both frames are kept)
df_merged = pd.merge(
    df_press_releases,
    df_press_release_content,
    how="inner",
    on="url",
)
df_merged.head(n=5)

Unnamed: 0,url,title,category,date,time,datetime_on_30_min,text
0,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve Board announces it will extend...,Monetary Policy,"June 25, 2021",3:30 pm EDT,2021-06-25 15:30:00-04:00,The Federal Reserve Board on Friday announced ...
1,https://www.federalreserve.gov/newsevents/pres...,Agencies release list of distressed or underse...,Banking and Consumer Regulatory Policy,"June 25, 2021",3:00 pm EDT,2021-06-25 15:30:00-04:00,The Board of Governors of the Federal Reserve ...
2,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve Board releases results of annu...,Banking and Consumer Regulatory Policy,"June 24, 2021",4:30 pm EDT,2021-06-24 16:30:00-04:00,The Federal Reserve Board on Thursday released...
3,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve announces it will continue its...,Monetary Policy,"June 23, 2021",9:45 am EDT,2021-06-23 09:30:00-04:00,The Federal Reserve on Wednesday announced tha...
4,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve Board extends comment period o...,Banking and Consumer Regulatory Policy,"June 22, 2021",10:30 am EDT,2021-06-22 10:30:00-04:00,The Federal Reserve Board announced on Tuesday...


In [113]:
# write the merged table to a CSV file in the working directory,
# omitting the DataFrame index column
df_merged.to_csv('fed_press_releases.csv', index=False)