<img align="left" src="https://statics.phbs.pku.edu.cn/uploadfile/2018/0119/20180119080526567.png" style="margin-top:50px">
<h1 align="right" style="margin-top:60px">Machine Learning for Finance (FIN 570) </h1>
<h1 align="right" style="margin-top:20px">Module 1, 2021-2022 Fall </h1>

<h1 align="center" style="margin-top:40px">Sentiment Analysis on Central Bank Statement (Part 1)</h1>

<center>
<font color=black size=4 face=times> Team: Hu Xueyang &amp; Zhai Sihan<br>
    Instructor: Jaehyuk Choi</font><br>
<font color=black size=3 face=times>(Last Modified on Nov 19, 2021)</font>
</center>

## 0 Collect HTML Files from FOMC Website

In [28]:
import urllib.request
import urllib.error
import re
import os
import time
from tqdm import tqdm
import pandas as pd

In [2]:
def get_html(url):
    """Fetch ``url`` and return the raw response body as bytes.

    The response is closed even when reading the body raises. Any error
    raised by ``urllib.request.urlopen`` propagates to the caller.
    """
    with urllib.request.urlopen(url) as page:
        return page.read()

def get_url(html, reg):
    """Return all matches of the regular expression ``reg`` in ``html``.

    ``html`` is a bytes object that is decoded as UTF-8 before matching.
    The result is the list produced by ``findall`` for the pattern.
    """
    text = html.decode('UTF-8')
    return re.findall(reg, text)

def get_file(url, file, dir):
    """Download ``url`` and save the response body to the path ``dir + file``.

    ``dir`` and ``file`` are joined by plain string concatenation, so ``dir``
    must already end with a path separator. The body is streamed in
    fixed-size chunks.

    An ``urllib.error.HTTPError`` raised during the download is reported on
    stdout and swallowed, so one broken link does not stop a batch download.
    Any other exception propagates to the caller.
    """
    block_size = 8192
    try:
        # The URL is opened before the target file, so a rejected request
        # never leaves an empty file behind. Both handles are closed on exit,
        # including when an exception interrupts the transfer.
        with urllib.request.urlopen(url) as u, open(dir + file, 'wb') as f:
            while True:
                buffer = u.read(block_size)
                if not buffer:
                    break
                f.write(buffer)
    except urllib.error.HTTPError as err:
        # Print the caught error and the URL; printing the exception class
        # would not tell which download failed or why.
        print('Failed to download {}: {}'.format(url, err))

### 0.1 Download FOMC statements

In [3]:
# Folder for the downloaded statement pages. The trailing separator is kept on
# purpose: the path is combined with file names by plain string concatenation.
# os.path.join produces the platform's own separator instead of a hard-coded
# backslash (and avoids the invalid '\h' escape of the literal form).
dir_stat = os.path.join('.', 'html_stat', '')
# exist_ok=True removes the check-then-create race and makes the cell re-runnable.
os.makedirs(dir_stat, exist_ok=True)

In [4]:
# 2016-2021
root_url_stat = 'https://www.federalreserve.gov/newsevents/pressreleases/'  # common part in the statement links
index_url = 'https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm'  # location of the links
html = get_html(index_url)  # raw bytes of the calendar page
url_lst_stat = get_url(html, r'(monetary\d{8}a)')  # statement link stems found on the page
print('Downloading statements from 2016 to 2021...')
for url in tqdm(url_lst_stat):
    # The 8-digit date inside the link becomes the local file name.
    file_name = re.search(r'\d{8}', url).group(0) + '.htm'
    # Check the exact path get_file writes to, so already-saved pages
    # (including repeated links on the page) are skipped.
    if not os.path.exists(dir_stat + file_name):
        get_file(root_url_stat + url + '.htm', file_name, dir_stat)
print('Done.')

Downloading statements from 2016 to 2021...


100%|████████████████████████████████████████████████████████████████████████████████| 146/146 [03:28<00:00,  1.43s/it]

Done.





In [7]:
# before 2016
root_url_history = 'https://www.federalreserve.gov/monetarypolicy/fomc_historical_year.htm'
root_url_stat = 'https://www.federalreserve.gov/newsevents/pressreleases/'


def _statement_link_rule(year):
    """Return ``(root_url, link_regex, url_suffix)`` for the statement links of ``year``.

    The link layout on the yearly index pages differs by period, so each
    period has its own URL prefix, its own pattern for finding the links, and
    its own suffix to append to a matched link (empty when the matched link
    already ends with '/').
    """
    if year < 1996:
        return 'https://www.federalreserve.gov/fomc/', r'(\d{8}default)', '.htm'
    if year == 1996:
        return 'https://www.federalreserve.gov/fomc/', r'(\d{8}DEFAULT)', '.htm'
    if year == 2002:
        return 'https://www.federalreserve.gov/boarddocs/press/', r'(monetary/\d{4}/\d{8}/)', ''
    if year < 2003:
        # 1997-2001 (2002 is handled by the branch above)
        return 'https://www.federalreserve.gov/boarddocs/press/', r'(general/\d{4}/\d{8}/)', ''
    if year < 2006:
        return 'https://www.federalreserve.gov/boarddocs/press/', r'(monetary/\d{4}/\d{8}/default)', '.htm'
    if year < 2011:
        return 'https://www.federalreserve.gov/newsevents/press/', r'(monetary/\d{8}\w)', '.htm'
    return 'https://www.federalreserve.gov/newsevents/pressreleases/', r'(monetary\d{8}\w)', '.htm'


print('Downloading statements from 1994 to 2015...')
url_lst_stat = []
for year in range(1994, 2016):
    index_url = 'https://www.federalreserve.gov/monetarypolicy/fomchistorical' + str(year) + '.htm'
    html = get_html(index_url)
    print(year)
    root_url_stat, link_regex, url_suffix = _statement_link_rule(year)
    url_lst_stat = get_url(html, link_regex)
    # One download pass per year. Each link is requested with the suffix that
    # belongs to its period, so a directory-style link (1997-2002) is never
    # retried with '.htm' appended.
    for url in url_lst_stat:
        # The 8-digit date inside the link becomes the local file name.
        file_name = re.search(r'\d{8}', url).group(0) + '.htm'
        # Check the exact path get_file writes to; skip pages already saved.
        if not os.path.exists(dir_stat + file_name):
            get_file(root_url_stat + url + url_suffix, file_name, dir_stat)
print('Done.')

Downloading statements from 1994 to 2015...
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
Done.


### 0.2 Download FOMC minutes

In [10]:
# Folder for the downloaded minutes pages. The trailing separator is kept on
# purpose: the path is combined with file names by plain string concatenation.
# os.path.join produces the platform's own separator instead of a hard-coded
# backslash (and avoids the invalid '\h' escape of the literal form).
dir_min = os.path.join('.', 'html_min', '')
# exist_ok=True removes the check-then-create race and makes the cell re-runnable.
os.makedirs(dir_min, exist_ok=True)

In [11]:
# 2016-2021
start_time = time.time()
root_url_min = 'https://www.federalreserve.gov/monetarypolicy/'  # common part in the minutes links
index_url = 'https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm'  # location of the links
html = get_html(index_url)  # raw bytes of the calendar page
url_lst_min = get_url(html, r'(fomcminutes\d{8})')  # minutes link stems found on the page
print('Downloading minutes from 2016 to 2021...')
for url in tqdm(url_lst_min):
    # The 8-digit date inside the link becomes the local file name.
    file_name = re.search(r'\d{8}', url).group(0) + '.htm'
    # Check the exact path get_file writes to, so already-saved pages
    # (including repeated links on the page) are skipped.
    if not os.path.exists(dir_min + file_name):
        get_file(root_url_min + url + '.htm', file_name, dir_min)
print('Done.')


Downloading minutes from 2016 to 2021...


100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [00:44<00:00,  2.05it/s]

Done.





In [20]:
# before 2016
root_url_history = 'https://www.federalreserve.gov/monetarypolicy/fomc_historical_year.htm'
root_url_stat = 'https://www.federalreserve.gov/newsevents/pressreleases/'
root_url_min = 'https://www.federalreserve.gov/monetarypolicy/'
print('Downloading minutes from 1994 to 2015...')
url_lst_min = []
for year in range(1994, 2016):
    index_url = 'https://www.federalreserve.gov/monetarypolicy/fomchistorical' + str(year) + '.htm'
    html = get_html(index_url)
    print(year)
    # The link layout on the yearly index pages differs by period: pick the
    # URL prefix and the pattern that finds the minutes links for this year.
    if year < 1996:
        root_url_min = 'https://www.federalreserve.gov/fomc/'
        link_regex = r'(MINUTES/\d{4}/\d{8}min)'
    elif year < 2008:
        root_url_min = 'https://www.federalreserve.gov/fomc/'
        link_regex = r'(minutes/\d{8})'
    else:
        root_url_min = 'https://www.federalreserve.gov/monetarypolicy/'
        link_regex = r'(fomcminutes\d{8})'
    url_lst_min = get_url(html, link_regex)
    for url in url_lst_min:
        # The 8-digit date inside the link becomes the local file name.
        file_name = re.search(r'\d{8}', url).group(0) + '.htm'
        # Check the exact path get_file writes to; skip pages already saved.
        if not os.path.exists(dir_min + file_name):
            get_file(root_url_min + url + '.htm', file_name, dir_min)
print('Done.')

Downloading minutes from 1994 to 2015...
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
<class 'urllib.error.HTTPError'>
2009
2010
2011
2012
<class 'urllib.error.HTTPError'>
2013
2014
2015
Done.


##### Since the formats of the links vary a lot, we download a file manually whenever its download reports an HTTP error.

### 0.3 Collect the publication dates of minutes

In [21]:
# Folder for the saved yearly index pages. The trailing separator is kept on
# purpose: the path is combined with file names by plain string concatenation.
# os.path.join produces the platform's own separator instead of a hard-coded
# backslash (and avoids the invalid '\h' escape of the literal form).
dir_year = os.path.join('.', 'html_year', '')
# exist_ok=True removes the check-then-create race and makes the cell re-runnable.
os.makedirs(dir_year, exist_ok=True)
# Accumulates the 'Released <month> <day>, <year>' strings found on the index pages.
date_lst = []

In [22]:
# before 2016
for year in range(1993, 2016):
    print(year)
    index_url = 'https://www.federalreserve.gov/monetarypolicy/fomchistorical' + str(year) + '.htm'  # before 2015
    page_path = dir_year + index_url.split('/')[-1]
    if os.path.exists(page_path):
        # Reuse the saved copy instead of requesting the same page again.
        with open(page_path, 'rb') as page_file:
            html = page_file.read()
    else:
        # Fetch the page once and save exactly the bytes that are parsed below.
        html = get_html(index_url)
        with open(page_path, 'wb') as page_file:
            page_file.write(html)
    # Collect every 'Released <month> <day>, <year>' string on the page.
    date = get_url(html, r'(Released\s\w{3,9}\s\d{1,2},\s\d{4})')
    date_lst.extend(date)
print('Done.')

1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
Done.


In [23]:
# 2016-2021
index_url = 'https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm'
# Fetch the calendar page once, then save those same bytes, so the copy on
# disk always matches what is parsed below.
html = get_html(index_url)
with open(dir_year + index_url.split('/')[-1], 'wb') as page_file:
    page_file.write(html)
# Collect every 'Released <month> <day>, <year>' string on the page.
date = get_url(html, r'(Released\s\w{3,9}\s\d{1,2},\s\d{4})')
date_lst.extend(date)
print('Done.')

Done.


In [24]:
# Show the 'Released ...' strings gathered from the index pages.
date_lst

['Released March 26, 1993',
 'Released May 21, 1993',
 'Released July 9, 1993',
 'Released August 20, 1993',
 'Released September 24, 1993',
 'Released November 19, 1993',
 'Released December 23, 1993',
 'Released February 4, 1994',
 'Released March 25, 1994',
 'Released May 20, 1994',
 'Released July 8, 1994',
 'Released August 19, 1994',
 'Released September 30, 1994',
 'Released November 18, 1994',
 'Released December 22, 1994',
 'Released February 3, 1995',
 'Released March 31, 1995',
 'Released May 26, 1995',
 'Released July 7, 1995',
 'Released August 25, 1995',
 'Released September 29, 1995',
 'Released November 17, 1995',
 'Released December 22, 1995',
 'Released February 2, 1996',
 'Released March 29, 1996',
 'Released May 24, 1996',
 'Released July 5, 1996',
 'Released August 23, 1996',
 'Released September 27, 1996',
 'Released November 15, 1996',
 'Released December 20, 1996',
 'Released February 6, 1997',
 'Released March 27, 1997',
 'Released May 22, 1997',
 'Released Jul

In [25]:
# Turn each 'Released <month> <day>, <year>' string into a YYYYMMDD string
def _as_yyyymmdd(text):
    """Parse '<month> <day>, <year>' (full or abbreviated month name) into 'YYYYMMDD'."""
    try:
        parsed = time.strptime(text, '%B %d, %Y')
    except ValueError:
        # Fall back to the abbreviated month name.
        parsed = time.strptime(text, '%b %d, %Y')
    return time.strftime("%Y%m%d", parsed)


prefix_len = len('Released ')
# Strip the leading 'Released ' marker, convert, and keep the result in ascending order.
date_released = sorted(_as_yyyymmdd(entry[prefix_len:]) for entry in date_lst)

In [26]:
# Show the release dates after conversion to sorted YYYYMMDD strings.
date_released

['19930326',
 '19930521',
 '19930709',
 '19930820',
 '19930924',
 '19931119',
 '19931223',
 '19940204',
 '19940325',
 '19940520',
 '19940708',
 '19940819',
 '19940930',
 '19941118',
 '19941222',
 '19950203',
 '19950331',
 '19950526',
 '19950707',
 '19950825',
 '19950929',
 '19951117',
 '19951222',
 '19960202',
 '19960329',
 '19960524',
 '19960705',
 '19960823',
 '19960927',
 '19961115',
 '19961220',
 '19970206',
 '19970327',
 '19970522',
 '19970703',
 '19970821',
 '19971002',
 '19971113',
 '19971218',
 '19980205',
 '19980402',
 '19980521',
 '19980702',
 '19980820',
 '19981001',
 '19981119',
 '19981223',
 '19990204',
 '19990401',
 '19990520',
 '19990701',
 '19990826',
 '19991007',
 '19991118',
 '19991223',
 '20000203',
 '20000323',
 '20000518',
 '20000629',
 '20000824',
 '20001005',
 '20001116',
 '20001221',
 '20010201',
 '20010322',
 '20010517',
 '20010628',
 '20010823',
 '20011004',
 '20011108',
 '20011213',
 '20020131',
 '20020321',
 '20020509',
 '20020627',
 '20020815',
 '20020926',

In [29]:
# Save the release dates as a single-column Excel sheet without index or header row.
minutes_date_path = dir_year + 'minutes_date.xlsx'
date_released = pd.DataFrame(date_released)
date_released.to_excel(minutes_date_path, index=False, header=None)