In [20]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

This notebook srapes information from central bank webpages and unifies this into a concise public speeches dataset further used (performed in August 2023, may be a subject to change -> code modifications needed and its functionality is not guaranteed).

Input: 
- __"ECB_Speeches.csv"__ =  dataset that has been downloaded from ECB webpage (https://www.ecb.europa.eu/press/key/html/downloads.en.html) containing information regarding ECB public speeches

Output: 
- __"AllSpeaches.csv"__ = dataset containing raw information regarding public statements of ECB and FED to be further worked with

## Dataset regarding FED public speeches

In [9]:
all_years=pd.DataFrame(columns=['link','title','speaker','event','year'])
years=range(2018,2022)
for year in years:
    page = requests.get(f'https://www.federalreserve.gov/newsevents/speech/{year}-speeches.htm')
    soup = BeautifulSoup(page.text, 'html.parser')
    events = soup.select(".eventlist__event")
    speeches_one_year = pd.DataFrame()
    for i,speech in enumerate(events):
        speeches_one_year.at[i,'link'] = 'https://www.federalreserve.gov'+events[i].find_all('a', href=True)[0]['href']
        speeches_one_year.at[i,'title'] = events[i].text.split('\n')[2]
        if events[i].text.split('\n')[3]=='Watch Live' or events[i].text.split('\n')[3]=='Video':
            speeches_one_year.at[i,'speaker'] = events[i].text.split('\n')[4]
            speeches_one_year.at[i,'event'] = events[i].text.split('\n')[5]
        else:
            speeches_one_year.at[i,'speaker'] = events[i].text.split('\n')[3]
            speeches_one_year.at[i,'event'] = events[i].text.split('\n')[4]
        speeches_one_year.at[i,'year'] = int(year)
    all_years=all_years.append(speeches_one_year,ignore_index=True)
for i in range(1,len(all_years)):
    if ((all_years.loc[i,'year']>2005)): 
        page = requests.get(all_years.loc[i,'link'])
        soup = BeautifulSoup(page.text, 'html.parser')
        events = soup.select(".col-md-8")
        text_list = events[1].text
        text_list = text_list.replace('\xa0', ' ')
        text_list = text_list.replace('\n', ' ')
        all_years.loc[i,'text'] = text_list
all_years['date'] = all_years['link'].str.extract('(\d\d\d\d\d\d\d\d)')
all_years = all_years[~all_years['text'].isna()]
all_years['text_len'] = all_years['text'].str.split().apply(len)
all_years['location'] = all_years.event.str.split(', ').apply(lambda x: x[-1])
all_years.loc[all_years['speaker']=='Chairman  Ben S. Bernanke','speaker'] = 'Chairman Ben S. Bernanke'
all_years.loc[all_years['speaker']=='Governor Ben S. Bernanke and Vincent R. Reinhart, Director, Division of Monetary Affairs','speaker'] = 'Governor Ben S. Bernanke'
all_years.loc[all_years['speaker']=='Governor Donald L. Kohn and Brian P. Sack, Senior Economist','speaker'] = 'Governor Donald L. Kohn'
all_years.loc[all_years['speaker']=='Governor Susan Schmidt Bies','speaker'] = 'Governor Susan S. Bies'
all_years.loc[all_years['speaker']=='Vice Chair for Supervision and Chair of the Financial Stability Board Randal K. Quarles','speaker'] = 'Vice Chair for Supervision Randal K. Quarles'
all_years.loc[all_years['speaker']=='Vice Chairman for Supervision and Chair of the Financial Stability Board Randal K. Quarles','speaker'] = 'Vice Chair for Supervision Randal K. Quarles'
all_years.loc[all_years['speaker']=='Vice Chairman for Supervision Randal K. Quarles','speaker'] = 'Vice Chair for Supervision Randal K. Quarles'
all_years.loc[all_years['speaker']=='Vice Chairman Roger W. Ferguson, Jr','speaker'] = 'Vice Chairman Roger W. Ferguson'
all_years.loc[all_years['speaker']=='Vice Chairman Roger W. Ferguson, Jr.','speaker'] = 'Vice Chairman Roger W. Ferguson'
all_years.loc[all_years['speaker']=='Chair Jerome H. Powell','speaker'] = 'Chairman Jerome H. Powell'
all_years.loc[all_years['speaker']=='Vice Chair Richard H. Clarida','speaker'] = 'Vice Chairman Richard H. Clarida'
all_years = all_years[all_years['speaker']!='Brian F. Madigan, Director, Division of Monetary Affairs']
all_years = all_years[all_years.text_len!=0]
all_years = all_years.sort_values("date")
all_years = all_years.reset_index(drop=True)
#    all_years.to_csv('FED_Speeches.csv',index=False)
all_years

  all_years=all_years.append(speeches_one_year,ignore_index=True)
  all_years=all_years.append(speeches_one_year,ignore_index=True)
  all_years=all_years.append(speeches_one_year,ignore_index=True)
  all_years=all_years.append(speeches_one_year,ignore_index=True)


Unnamed: 0,link,title,speaker,event,year,text,date,text_len,location
0,https://www.federalreserve.gov/newsevents/spee...,Vice Chairman for Supervision Randal K. Quarles,At the American Bar Association Banking Law Co...,,2018.0,It is a pleasure to be here with you at the A...,20180119,2719,
1,https://www.federalreserve.gov/newsevents/spee...,Chairman Jerome H. Powell,"At the Federal Reserve Board, Washington, D.C.",,2018.0,It is both humbling and a great privilege to ...,20180213,734,
2,https://www.federalreserve.gov/newsevents/spee...,Vice Chairman for Supervision Randal K. Quarles,"At ""10 Years after the Global Financial Crisis...",,2018.0,I am very happy to be participating in this s...,20180222,1192,
3,https://www.federalreserve.gov/newsevents/spee...,Vice Chairman for Supervision Randal K. Quarles,"At ""Promoting Sustained Growth: Policy Tension...",,2018.0,Thank you for the opportunity to take part in...,20180226,3835,
4,https://www.federalreserve.gov/newsevents/spee...,Vice Chairman for Supervision Randal K. Quarles,At the Financial Services Roundtable 2018 Spri...,,2018.0,Thank you very much for having me here at the...,20180226,792,
...,...,...,...,...,...,...,...,...,...
237,https://www.federalreserve.gov/newsevents/spee...,Watch Live,Chairman Jerome H. Powell,At the Introducing the New York Innovation Cen...,2021.0,Accessible Keys for Video [Space Ba...,20211129,484,D.C. (via webcast)
238,https://www.federalreserve.gov/newsevents/spee...,Watch Live,Governor Michelle W. Bowman,At Virtual Symposium on Indigenous Economies: ...,2021.0,Thank you to the Bank of Canada for the invit...,20211129,2016,the Reserve Bank of New Zealand
239,https://www.federalreserve.gov/newsevents/spee...,Watch Live,Vice Chairman Richard H. Clarida,"At the Federal Reserve Bank of Cleveland, Clev...",2021.0,The COVID-19 pandemic and the mitigation effo...,20211130,4258,Ohio (via livestream)
240,https://www.federalreserve.gov/newsevents/spee...,Watch Live,Governor Randal K. Quarles,"At the American Enterprise Institute, Washingt...",2021.0,When I joined the Board of Governors as Vice ...,20211202,7808,D.C.


## Dataset regarding FED press releases

In [10]:
# Send a GET request to the press releases page
all_years = pd.DataFrame(columns=['link','title','topic','year', "text"])
years = range(2018,2022)
for year in years:
    page = requests.get(f'https://www.federalreserve.gov/newsevents/pressreleases/{year}-press.htm')
    soup = BeautifulSoup(page.text, 'html.parser')
    events = soup.select(".eventlist__event")
    speeches_one_year = pd.DataFrame()
    for i,speech in enumerate(events):
        if events[i].find_all('a', href=True)[0]['href'][0] == "/":
            speeches_one_year.at[i,'link'] = 'https://www.federalreserve.gov'+events[i].find_all('a', href=True)[0]['href']
        else: 
            # only released at FED websites
            print(f"Invalid url. Details:{events[i].find_all('a', href=True)[0]['href']}")
            continue
        speeches_one_year.at[i,'title'] = events[i].text.split('\n')[1]
        speeches_one_year.at[i,'topic'] = events[i].text.split('\n')[3]
        speeches_one_year.at[i,'year'] = int(year)
        page = requests.get('https://www.federalreserve.gov'+events[i].find_all('a', href=True)[0]['href'])
        soup = BeautifulSoup(page.text, 'html.parser')
        text = soup.find("div", class_="col-xs-12 col-sm-8 col-md-8").text.replace("\n", " ").strip()
        speeches_one_year.at[i,'text'] = text
    all_years=all_years.append(speeches_one_year,ignore_index=True)
for i in range(1,len(all_years)):
    if ((all_years.loc[i,'year']>2005)): 
        page = requests.get(all_years.loc[i,'link'])
        soup = BeautifulSoup(page.text, 'html.parser')
s
all_years['date'] = all_years['link'].str.extract('(\d\d\d\d\d\d\d\d)')
all_years = all_years[~all_years['text'].isna()]
all_years['text_len'] = all_years['text'].str.split().apply(len)
all_years = all_years[all_years["date"] < "20210920"]
all_years = all_years.sort_values("date").reset_index(drop=True)
# all_years.to_csv('FED_Press',index=False)
all_years

Invalid url. Details:https://www.frbsf.org/our-district/press/news-releases/2018/mary-c-daly-named-federal-reserve-bank-of-san-francisco-president-and-chief-executive-officer/
Invalid url. Details:https://www.newyorkfed.org/newsevents/news/aboutthefed/2018/oa180403


  all_years=all_years.append(speeches_one_year,ignore_index=True)
  all_years=all_years.append(speeches_one_year,ignore_index=True)
  all_years=all_years.append(speeches_one_year,ignore_index=True)
  all_years=all_years.append(speeches_one_year,ignore_index=True)


Unnamed: 0,link,title,topic,year,text,date,text_len
0,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve Board announces approval of ap...,,2018.0,The Federal Reserve Board on Wednesday announc...,20180103,44
1,https://www.federalreserve.gov/newsevents/pres...,"Minutes of the Federal Open Market Committee, ...",,2018.0,The Federal Reserve Board and the Federal Open...,20180103,149
2,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve Board announces termination of...,,2018.0,The Federal Reserve Board on Thursday announce...,20180104,54
3,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve Board requests comment on prop...,,2018.0,The Federal Reserve Board on Thursday requeste...,20180104,354
4,https://www.federalreserve.gov/newsevents/pres...,Minutes of the Board's discount rate meetings ...,,2018.0,The Federal Reserve Board on Tuesday released ...,20180109,32
...,...,...,...,...,...,...,...
746,https://www.federalreserve.gov/newsevents/pres...,Federal and state financial regulatory agencie...,,2021.0,"The Office of the Comptroller of the Currency,...",20210831,633
747,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve Board announces termination of...,,2021.0,The Federal Reserve Board on Thursday announce...,20210902,55
748,https://www.federalreserve.gov/newsevents/pres...,Agencies extend comment period on proposed ris...,,2021.0,The federal bank regulatory agencies announced...,20210907,124
749,https://www.federalreserve.gov/newsevents/pres...,Federal Reserve publishes paper describing lan...,,2021.0,To support responsible innovation in the commu...,20210909,130


## Dataset regarding ECB Press Releases

In [21]:
press = pd.DataFrame(columns=['Title','Date','Topic', "Content", "Link"])
base_url = "https://www.ecb.europa.eu"
years = range(2018, 2022)
for year in years:
    page = requests.get(f"https://www.ecb.europa.eu/press/pr/date/{year}/html/index_include.en.html")
    soup = BeautifulSoup(page.text, 'html.parser')
    modified = str(soup).replace("<dt", "<div class=\"article\"> <dt").replace("<div class=\"art", "</div> <div class=\"art")
    soup1 = BeautifulSoup(modified, 'html.parser')
    curr_year = pd.DataFrame(columns=['Title','Date','Topic', "Content", 'Link'])
    for i, article in enumerate(soup1.find_all("div", class_="article")):
        curr_year.at[i,'Title'] = article.find("div", class_="title").text
        curr_year.at[i,'Date'] = datetime.strptime((article.find("dt")["isodate"]), '%Y-%m-%d').date()
        curr_year.at[i,'Topic'] = article.find("div", class_="category").text
        curr_year.at[i,'Link'] = base_url + article.find("a")["href"]
        page = requests.get(curr_year["Link"][i])
        soup = BeautifulSoup(page.text, 'html.parser')
        curr_year.at[i, "Content"] = soup.text.replace("\n", "").split("For media queries")[0].split("\r")[-1]
    press = pd.concat([press, curr_year], axis=0)
press = press[press["Date"] <= pd.Timestamp('2021-09-20')]
press.sort_values("Date").reset_index().drop('index', axis=1)
#press.to_csv("ECB_Press.csv")
press

  press = press[press["Date"] <= pd.Timestamp('2021-09-20')]


Unnamed: 0,Title,Date,Topic,Content,Link
0,Working group on euro risk-free rates seeks ma...,2018-12-20,PRESS RELEASE,PRESS RELEASE...,https://www.ecb.europa.eu/press/pr/date/2018/h...
1,ECB decides on technical parameters for the re...,2018-12-13,PRESS RELEASE,PRESS RELEASE...,https://www.ecb.europa.eu/press/pr/date/2018/h...
2,Monetary policy decisions,2018-12-13,MONETARY POLICY DECISION,PRESS RELEASE...,https://www.ecb.europa.eu/press/pr/date/2018/h...
3,ECB adopts new capital key,2018-12-03,PRESS RELEASE,PRESS RELEASE...,https://www.ecb.europa.eu/press/pr/date/2018/h...
4,"Comparison of capital key, Eurosystem key and ...",2018-12-03,PRESS RELEASE,"�d��,5����",https://www.ecb.europa.eu/press/pr/date/2018/h...
...,...,...,...,...,...
87,Monetary policy decisions,2021-01-21,MONETARY POLICY DECISION,PRESS RELEASE...,https://www.ecb.europa.eu/press/pr/date/2021/h...
88,January 2021 euro area bank lending survey,2021-01-19,PRESS RELEASE,PRESS RELEASE...,https://www.ecb.europa.eu/press/pr/date/2021/h...
89,The euro area bank lending survey – Fourth qua...,2021-01-19,THE EURO AREA BANK LENDING SURVEY,The euro area...,https://www.ecb.europa.eu/stats/ecb_surveys/ba...
90,ECB digital euro consultation ends with record...,2021-01-13,PRESS RELEASE,PRESS RELEASE...,https://www.ecb.europa.eu/press/pr/date/2021/h...


## Dataset regarding ECB speeches
This dataset is publicly available at ECB websites, and thus was easily downloaded at https://www.ecb.europa.eu/press/key/html/downloads.en.html

In [18]:
ECBspeeches = pd.read_csv("ECB_Speeches.csv", sep=",")
ECBspeeches = ECBspeeches[("2018-01-01" <= ECBspeeches["date"]) & (ECBspeeches["date"] <= "2021-09-21")]
ECBspeeches = ECBspeeches.sort_values("date").reset_index(drop=True).iloc[:,1:]
#ECBspeeches.to_csv("ECB_Speeches.csv")
ECBspeeches

Unnamed: 0,date,speakers,title,subtitle,contents
0,2018-01-29,Peter Praet,Maintaining price stability with unconventiona...,"Speech by Peter Praet, Member of the Executive...",Maintaining price stability with unconventio...
1,2018-01-29,Sabine Lautenschläger,Basel III - sense and sensitivity,"Speech by Sabine Lautenschläger, Member of the...",Basel III - sense and sensitivity Speech b...
2,2018-01-29,Benoît Cœuré,La situation économique dans la zone euro et l...,"Présentation de Benoît Cœuré, membre du direct...",La situation économique dans la zone euro et...
3,2018-01-30,Yves Mersch,The limits of central bank financing in resolu...,"Speech by Yves Mersch, Member of the Executive...",The limits of central bank financing in reso...
4,2018-01-31,Benoît Cœuré,What yield curves are telling us,"Speech by Benoît Cœuré, Member of the Executiv...",What yield curves are telling us Speech by...
...,...,...,...,...,...
375,2021-08-26,Isabel Schnabel,Die neue geldpolitische Strategie der Europäis...,"Vortrag von Isabel Schnabel, Mitglied des Dire...",
376,2021-09-13,Isabel Schnabel,New narratives on monetary policy – the spectr...,"Speech by Isabel Schnabel, Member of the Execu...",SPEECH New narratives on monetary policy –...
377,2021-09-15,Isabel Schnabel,The monetary policy non-puzzle in bond markets,"Speech by Isabel Schnabel, Member of the Execu...",SPEECH The monetary policy non-puzzle in b...
378,2021-09-15,Philip R. Lane,The ECB's monetary policy strategy review - IM...,"Presentation by Philip R. Lane, Member of the ...",


## Concatenating central bank info
Once we mined the info from CBs, let's create CB dataset and then merge it with the modelling dataset.

In [None]:
def convert_date_format(date_str):
    date_obj = datetime.strptime(date_str, '%Y%m%d')
    return date_obj.strftime('%Y-%m-%d')

ECBpress = pd.read_csv("ECB_Press.csv").reset_index(drop=True).iloc[:,1:]
ECBpress["Speaker"] = "ECB"
ECBpress = ECBpress[["Date", "Title", "Topic", "Speaker", "Content"]]

ECBspeech = pd.read_csv("ECB_Speeches.csv").reset_index(drop=True).iloc[:,1:]
ECBspeech.columns = ["Date", "Speaker", "Title", "Subtitle", "Content"]
ECBspeech["Speaker"] = "ECB"
ECBspeech["Topic"] = "Speech"
ECBspeech["Content"] = ECBspeech["Subtitle"] + "  " + ECBspeech["Content"]
ECBspeech = ECBspeech[["Date", "Title", "Topic", "Speaker", "Content"]]

FEDpress = pd.read_csv("FED_Press.csv")
FEDpress.columns = ["Link", "Title", "Topic", "Year", "Content", "Date", "TextLen"]
FEDpress["Speaker"] = "FED"
FEDpress["Topic"] = "PressRelease"
FEDpress = FEDpress[["Date", "Title", "Topic", "Speaker", "Content"]]
FEDpress["Date"] = FEDpress["Date"].astype(str).apply(convert_date_format)

FEDspeech = pd.read_csv("FED_Speeches.csv")
FEDspeech.columns = ["Link", "Title", "Speaker", "Event", "Year", "Content", "Date", "TextLen", "Location"]
FEDspeech["Topic"] = "Speech"
FEDspeech["Content"] = FEDspeech["Event"] + "  " + FEDspeech["Content"]
FEDspeech["Speaker"] = "FED"
FEDspeech = FEDspeech[["Date", "Title", "Topic", "Speaker", "Content"]]
FEDspeech["Date"] = FEDspeech["Date"].astype(str).apply(convert_date_format)
FEDspeech = FEDspeech[FEDspeech["Date"] <= "2021-09-21"]

In [None]:
ECBpress.sort_values("Date").reset_index(drop=True)

In [None]:
# Final concatenation
AllSpeaches = pd.concat([ECBpress, ECBspeech, FEDpress, FEDspeech]).sort_values(by="Date").reset_index(drop=True)


AllSpeaches = AllSpeaches.dropna().reset_index(drop=True)
AllSpeaches

# Save if needed
#AllSpeaches.to_csv("AllSpeaches.csv")