# Appendix: Download and process PDF files from the FOMC website

## 1 Download PDF files

In [None]:
import urllib.request
from tqdm import tqdm
import re
import os

In [None]:
# fetch the page behind a url and return its raw content
def get_html(url):
    """Open *url* and return the complete response body.

    Args:
        url: Address handed directly to ``urllib.request.urlopen``.

    Returns:
        The undecoded bytes read from the response.
    """
    # The context manager closes the response even when read() raises.
    with urllib.request.urlopen(url) as page:
        return page.read()

# collect every match of a regular expression in a downloaded page
def get_url(html, reg):
    """Return all matches of *reg* found in *html*.

    Args:
        html: Page content as bytes; it is decoded as UTF-8 before matching.
        reg: Regular-expression pattern string.

    Returns:
        The list produced by ``findall`` on the decoded text.
    """
    decoded = html.decode('UTF-8')
    return re.findall(reg, decoded)

# download the file
def get_file(url, file, dir):
    """Download *url* and store it at the path ``dir + file``.

    Args:
        url: Address of the resource to download.
        file: Name of the output file.
        dir: Destination prefix. It is concatenated with *file* as-is, so it
            has to end with a path separator.

    An ``urllib.error.HTTPError`` is printed and swallowed; any other
    exception propagates to the caller.
    """
    block_size = 8192
    try:
        # urlopen runs first, so no output file is created when the request
        # itself fails. Both handles are closed on every exit path.
        with urllib.request.urlopen(url) as response, \
                open(dir + file, 'wb') as out:
            while True:
                buffer = response.read(block_size)
                if not buffer:
                    break
                out.write(buffer)
    except urllib.error.HTTPError as err:
        # Report the caught error (status and reason), not the exception class.
        print(f'{url}: {err}')

In [None]:
# Output directories for the downloaded PDFs. The empty last component makes
# os.path.join append the platform's path separator, which get_file() relies
# on because it builds the target path as dir + file.
dir_stat = os.path.join('.', 'pdf_stat', '')
os.makedirs(dir_stat, exist_ok=True)

dir_min = os.path.join('.', 'pdf_min', '')
os.makedirs(dir_min, exist_ok=True)

In [None]:
# PDF files are available for 2016-2021.
# root_url is the prefix that the download cells put in front of each
# document name; index_url is the calendar page scanned for those names.
root_url = 'https://www.federalreserve.gov/monetarypolicy/files/'
index_url = 'https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm'  
# Fetch the calendar page once; html holds whatever get_html returns for it.
html = get_html(index_url)

In [None]:
# Statement names found on the calendar page (pattern: monetary + 8 digits + a1).
url_lst_stat = get_url(html, r'(monetary\d{8}a1)')
print('Downloading statements from 2016 to 2021...')
for stem in tqdm(url_lst_stat):
    pdf_name = stem + '.pdf'
    # Skip anything that is already present in the statements directory.
    if pdf_name in os.listdir(dir_stat):
        continue
    get_file(root_url + pdf_name, pdf_name, dir_stat)
print('Done.')

In [None]:
# Minutes names found on the calendar page (pattern: fomcminutes + 8 digits).
url_lst_min = get_url(html, r'(fomcminutes\d{8})')
print('Downloading minutes from 2016 to 2021...')
for stem in tqdm(url_lst_min):
    pdf_name = stem + '.pdf'
    # Skip anything that is already present in the minutes directory.
    if pdf_name in os.listdir(dir_min):
        continue
    get_file(root_url + pdf_name, pdf_name, dir_min)
print('Done.')

## 2 Convert PDF files into txt files

In [None]:
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
import os
import sys

In [None]:
def read_pdf(pdf_file):
    """Run *pdf_file* through the pdfminer text converter and return the text.

    Args:
        pdf_file: Object handed to ``process_pdf`` as the document to read
            (the caller in this file passes a file opened in 'rb' mode).

    Returns:
        Everything the converter wrote to its in-memory buffer, as one string.
    """
    manager = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(manager, buffer, laparams=LAParams())

    process_pdf(manager, converter, pdf_file)
    converter.close()

    # Read the buffer before closing it; getvalue() is unavailable afterwards.
    text = buffer.getvalue()
    buffer.close()
    return text


def save_txt(txt, txt_name):
    """Write *txt* as UTF-8 to the file *txt_name* under the txt resource path.

    Args:
        txt: Text to write.
        txt_name: File name appended to the directory prefix.

    NOTE(review): ``resource_path`` is not defined in the visible part of
    this file - confirm where it comes from and which directory it returns.
    """
    target = resource_path(r'txt\\') + txt_name
    with open(target, "w", encoding='utf-8') as out:
        out.write(txt)

## 3 Construct training samples for Doc2Vec model

### 3.1 Define functions

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
import time
import re

In [None]:
# read a txt file into a single-line string
def txt_to_str(txt_file):
    """Return the content of *txt_file* with every newline replaced by a space.

    Args:
        txt_file: Path of the text file. It is read as UTF-8 and undecodable
            bytes are ignored.

    Returns:
        The file content as one string. An empty file yields '' and a warning
        is printed.
    """
    with open(txt_file, encoding='utf-8', errors='ignore') as f:
        contents = f.read().replace('\n', ' ')
    if contents == '':
        print(f'Empty str encountered in {txt_file}. Please check.')
    return contents

In [None]:
# construct TaggedDocument data with sentences
def str_to_sentences(string, tag=False):
    """Split *string* into sentences on the literal separator '. '.

    Args:
        string: Text to split.
        tag: When true, wrap every sentence in a ``TaggedDocument`` whose
            words are the whitespace-separated tokens of the sentence and
            whose single tag is the sentence's position as a string.

    Returns:
        A list of sentence strings (``tag`` false) or of ``TaggedDocument``
        objects (``tag`` true). Empty input prints a warning and yields an
        empty list, so callers always receive a list.
    """
    if string == '':
        print('Cannot process empty string. Please check.')
        return []

    sentences = string.split('. ')
    if not tag:
        return sentences

    # TaggedDocument takes a list of tokens as its words. Passing the raw
    # sentence string would make every character count as a word.
    return [
        TaggedDocument(sentence.split(), [str(index)])
        for index, sentence in enumerate(sentences)
    ]

In [None]:
# construct TaggedDocument data with documents
def label_doc(docs, labels, tag=True):
    """Pair every document with its label for Doc2Vec training.

    Args:
        docs: Iterable of documents. A document given as a string is split
            on whitespace into tokens; anything else is passed through as
            the word list unchanged.
        labels: Sequence indexed by document position; ``labels[i]`` becomes
            the single tag of the i-th document.
        tag: When false, no tagging is done and the documents are returned
            as a plain list.

    Returns:
        A list of ``TaggedDocument`` objects (``tag`` true) or a list holding
        the items of *docs* (``tag`` false).
    """
    if not tag:
        return list(docs)

    # TaggedDocument takes a list of tokens as its words. Passing a raw
    # string would make every character count as a word.
    return [
        TaggedDocument(
            item.split() if isinstance(item, str) else item,
            [labels[count]],
        )
        for count, item in enumerate(docs)
    ]

### 3.2 Construct training samples

In [None]:
# Directories the text files are read from in the cells below; both end with
# '/' because file names are appended by plain string concatenation.
# NOTE(review): nothing visible in this file creates these directories, and
# save_txt writes under a different prefix - confirm how they are populated.
txt_stat_path = './txt_stat/'
txt_min_path = './txt_min/'

In [None]:
# Convert every file in the statements PDF directory to text.
pdf_stat_list = os.listdir(dir_stat)
for file in tqdm(pdf_stat_list):
    # The with-block closes the PDF handle once its text has been extracted.
    with open(dir_stat + file, 'rb') as pdf_file:
        txt = read_pdf(pdf_file)
    # Same base name with a .txt extension; save_txt chooses the directory.
    save_txt(txt, file.split('.', 1)[0] + '.txt')

In [None]:
# Build the training samples from the statement text files; each document is
# labelled with its own file name.
txt_stat_list = os.listdir(txt_stat_path)
raw_contents = [txt_to_str(txt_stat_path + file) for file in txt_stat_list]
train_data = label_doc(raw_contents, txt_stat_list)

In [None]:
# Build the training samples from the minutes text files; each document is
# labelled with its own file name.
# NOTE(review): this rebinds raw_contents and train_data, so samples built
# from the statements earlier are no longer in train_data - confirm whether
# the two sets were meant to be combined.
txt_min_list = os.listdir(txt_min_path)
raw_contents = [txt_to_str(txt_min_path + file) for file in txt_min_list]
train_data = label_doc(raw_contents, txt_min_list)

## 4 Build and train a Doc2Vec model

In [None]:
# initialize a Doc2Vec model using distributed memory method (dm=1)
# vector_size=20 is also the width of the result tables built later from
# model.vector_size.
model = Doc2Vec(dm=1, vector_size=20, window=5, min_count=2, workers=4)

In [None]:
# train and save the model
# The vocabulary is built from train_data first; training then runs for 50
# epochs over the same data, using the corpus size recorded on the model.
model.build_vocab(train_data)
model.train(train_data, total_examples=model.corpus_count, epochs=50)
# Persist the trained model to the working directory.
model.save('doc2vec_txt.model')

## 5 Obtain document vectors

In [None]:
# obtain doc vectors for statements
# vec_min is still allocated here, as before, so that later cells which
# expect it to exist keep working.
vec_min = pd.DataFrame(np.zeros([len(txt_min_list), model.vector_size]))

date_pattern = re.compile(r'\d{8}')
stat_rows = []
for file in txt_stat_list:
    # re.findall never returns None, so the old test was always true; use
    # search() and skip names that carry no 8-digit date.
    date_match = date_pattern.search(file)
    if date_match is None:
        continue
    contents = txt_to_str(txt_stat_path + file)
    # infer_vector expects the document as a list of word tokens.
    vec = model.infer_vector(contents.split())
    row = dict(enumerate(vec.tolist()))
    row['file'] = file.split('.', 1)[0]
    # Store the first date found as a plain string, not a list of matches.
    row['date'] = date_match.group()
    stat_rows.append(row)

# Same layout as before: vector components 0..n-1, then 'file', indexed by date.
vec_stat = pd.DataFrame(
    stat_rows, columns=[*range(model.vector_size), 'file', 'date']
)
vec_stat.set_index('date', inplace=True)
vec_stat.to_excel('vec_stat_pdf.xlsx')
vec_stat

In [None]:
# obtain doc vectors for minutes
date_pattern = re.compile(r'\d{8}')
min_rows = []
for file in txt_min_list:
    # re.findall never returns None, so the old test was always true; use
    # search() and skip names that carry no 8-digit date.
    date_match = date_pattern.search(file)
    if date_match is None:
        continue
    contents = txt_to_str(txt_min_path + file)
    # infer_vector expects the document as a list of word tokens.
    vec = model.infer_vector(contents.split())
    row = dict(enumerate(vec.tolist()))
    row['file'] = file.split('.', 1)[0]
    # Store the first date found as a plain string, not a list of matches.
    row['date'] = date_match.group()
    min_rows.append(row)

# Build the table here so this cell does not rely on a frame allocated
# elsewhere: vector components 0..n-1, then 'file', indexed by date.
vec_min = pd.DataFrame(
    min_rows, columns=[*range(model.vector_size), 'file', 'date']
)
vec_min.set_index('date', inplace=True)
vec_min.to_excel('vec_min_pdf.xlsx')
vec_min