# Appendix: Download and Process PDF Files from FOMC Website

## 1. Download PDF files

In [1]:
import urllib.request
from tqdm import tqdm
import re
import os

In [2]:
# open the url and read all the links on it
def get_html(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    page.close()
    return html

# compile the regular expression and match strings
def get_url(html, reg):
    url_re = re.compile(reg)
    url_lst = url_re.findall(html.decode('UTF-8'))
    return url_lst

# download the file
def get_file(url, file, dir):
    try:
        u = urllib.request.urlopen(url)
        # u = urllib.request.urlopen(url, timeout=30)
        f = open(dir + file, 'wb')
        block_size = 8192
        while True:
            buffer = u.read(block_size)
            if not buffer:
                break

            f.write(buffer)
        f.close()
        u.close()
    except urllib.error.HTTPError:
        print(urllib.error.HTTPError)

In [3]:
dir_stat = '.\pdf_stat\\'
if os.path.exists(dir_stat) == False:
    os.mkdir(dir_stat)
else:
    pass

dir_min = '.\pdf_min\\'
if os.path.exists(dir_min) == False:
    os.mkdir(dir_min)
else:
    pass

In [4]:
# pdf files are available during 2016-2021
root_url = 'https://www.federalreserve.gov/monetarypolicy/files/'
index_url = 'https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm'  
html = get_html(index_url)

In [5]:
url_lst_stat = get_url(html, r'(monetary\d{8}a1)')
print('Downloading statements from 2016 to 2021...')
for url in tqdm(url_lst_stat):
    if url + '.pdf' not in os.listdir(dir_stat):
        url = root_url + url + '.pdf'
        get_file(url, url.split('/')[-1], dir_stat)
    else:
        pass
print('Done.')

Downloading statements from 2016 to 2021...


100%|██████████████████████████████████████████████████████████████████████████████████| 94/94 [06:33<00:00,  4.19s/it]

Done.





In [6]:
url_lst_min = get_url(html, r'(fomcminutes\d{8})')
print('Downloading minutes from 2016 to 2021...')
for url in tqdm(url_lst_min):
    if url + '.pdf' not in os.listdir(dir_min):
        url = root_url + url + '.pdf'
        get_file(url, url.split('/')[-1], dir_min)
    else:
        pass
print('Done.')

Downloading minutes from 2016 to 2021...


100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [10:30<00:00,  6.86s/it]

Done.





## 2. Convert PDF into txt files

In [7]:
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
import os
import sys

In [8]:
def read_pdf(pdf_file):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)

    process_pdf(rsrcmgr, device, pdf_file)
    device.close()

    contents = retstr.getvalue()
    retstr.close()

    return contents

In [19]:
def save_txt(txt, txt_name, txt_dir):
    with open(txt_dir + txt_name, "w", encoding='utf-8') as f:
        f.write(txt)

## 3. Construct training samples for Doc2Vec model

### 3.1 Define functions

In [9]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
import time
import re

In [10]:
# remove '\n' in txt files
def txt_to_str(txt_file):
    with open(txt_file, encoding='utf-8', errors='ignore') as f:
        contents = f.read().replace('\n', ' ')
    if contents == '':
        print('Empty str encounted in' + txt_file + '. Please check.')
    return contents

In [11]:
# construct TaggedDocument data with sentences
def str_to_sentences(string, tag=False):
    train_content = []
    count = 0
    if string == '':
        print('Cannot process empty string. Please check.')
    else:
        if tag == True:
            for item in string.split('. '):
                try:
                    train_content.append(TaggedDocument(item, [str(count)]))
                    count += 1
                except ValueError:
                    pass
            return train_content
        else:
            for item in string.split('. '):
                try:
                    train_content.append(item)
                except ValueError:
                    pass
            return train_content

In [12]:
# construct TaggedDocument data with documents
def label_doc(docs, labels, tag=True):
    train_content = []
    if tag == True:
        for count, item in enumerate(docs):
            try:
                train_content.append(TaggedDocument(item, [labels[count]]))
            except ValueError:
                pass
        return train_content
    else:
        for item in docs:
            try:
                train_content.append(item)
            except ValueError:
                pass
        return train_content

### 3.2 Construct training samples

In [13]:
txt_stat_path = './txt_stat/'
txt_min_path = './txt_min/'

In [22]:
pdf_stat_list = os.listdir(dir_stat)
for file in pdf_stat_list:
    txt = read_pdf(open(dir_stat + file, 'rb'))
    save_txt(txt, file.split('.', 1)[0] + '.txt', txt_stat_path)



Warnings are reported because pdfminer cannot locate some of the contents such as logos and special symbols while reading the pdf files.

In [24]:
pdf_min_list = os.listdir(dir_min)
for file in pdf_min_list:
    txt = read_pdf(open(dir_min + file, 'rb'))
    save_txt(txt, file.split('.', 1)[0] + '.txt', txt_min_path)



















































In [26]:
txt_stat_list = os.listdir(txt_stat_path)
raw_contents = []
for file in txt_stat_list:
    contents = txt_to_str(txt_stat_path + file)
    raw_contents.append(contents)
train_data  = label_doc(raw_contents, txt_stat_list)

In [34]:
txt_min_list = os.listdir(txt_min_path)
raw_contents = []
for file in txt_min_list:
    contents = txt_to_str(txt_min_path + file)
    raw_contents.append(contents)
train_data  = label_doc(raw_contents, txt_min_list)

## 4. Build and train a Doc2Vec model

In [35]:
# initialize a Doc2Vec model using distributed memory method
model = Doc2Vec(dm=1, vector_size=20, window=5, min_count=2, workers=4)

In [36]:
# train and save the model
model.build_vocab(train_data)
model.train(train_data, total_examples=model.corpus_count, epochs=50)
model.save('doc2vec_txt.model')



## 5. Obtain document vectors

In [37]:
# obtain doc vectors for statements
vec_stat = pd.DataFrame(np.zeros([len(txt_stat_list), model.vector_size]))
vec_min = pd.DataFrame(np.zeros([len(txt_min_list), model.vector_size]))
for i in range(len(txt_stat_list)):
    file = txt_stat_list[i]
    if re.findall(r'\d{8}', file) is not None:
        contents = txt_to_str(txt_stat_path + file)
        test_content = str_to_sentences(contents, tag=False)
        vec = model.infer_vector(test_content)
        vec_stat.loc[i, 'file'] = file.split('.', 1)[0]
        vec_stat.iloc[i, : model.vector_size] = vec
        vec_stat.loc[i, 'date'] = re.findall(r'\d{8}', file)
    else:
        pass
vec_stat.set_index('date', inplace=True)
vec_stat.to_excel('vec_stat_pdf.xlsx')
vec_stat

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,file
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20160127,-0.007876,0.015897,0.024243,0.00276,-6e-06,-0.002969,-0.001833,0.009223,-0.022635,0.008876,...,-0.009222,-0.013769,-0.012264,-0.006189,0.001022,0.024422,-0.01335,-0.014459,-0.010979,monetary20160127a1
20160316,0.021014,0.022359,0.022995,0.006009,0.000957,-0.018556,0.010689,0.002621,0.020347,-0.022328,...,0.018613,-0.010741,-0.020612,-0.015346,0.023449,-0.020201,-0.003289,-0.004583,0.02265,monetary20160316a1
20160427,0.013353,-0.007121,-0.008631,0.005814,0.000156,0.000425,-0.009327,-0.022078,0.008516,-0.022242,...,-0.001991,0.014618,-0.007546,-0.015265,-0.000853,-0.023666,0.018686,-0.01065,0.005748,monetary20160427a1
20160615,0.021921,-0.012279,0.001858,0.016004,-0.02154,-0.022716,0.019951,0.009664,0.00078,-0.006731,...,-0.018157,-0.008302,0.021224,-0.016932,-0.004973,-0.00639,0.021398,-0.006968,0.008954,monetary20160615a1
20160727,-0.002113,-0.015112,0.008443,0.004247,0.015858,-0.015285,0.01987,-0.003553,-0.014149,-0.005797,...,0.016315,0.024449,0.000639,-0.009604,-0.013218,-0.015366,0.012879,0.0015,0.00944,monetary20160727a1
20160921,-0.011365,0.022571,0.013406,0.018066,0.00288,-0.016868,-0.015135,0.000393,0.023737,-0.008646,...,0.010148,0.018991,0.006694,0.018007,-0.004369,0.021494,0.001003,0.013668,0.001637,monetary20160921a1
20161102,0.011238,-0.023362,0.00021,0.014109,0.022998,0.014038,0.010712,0.020428,0.000862,0.020726,...,-0.016416,0.00747,-0.01615,-0.012325,-0.008902,0.007133,-0.001255,-0.012249,-0.011826,monetary20161102a1
20161214,-0.016457,-0.008148,0.004281,0.018169,-0.00233,-0.016699,0.019797,-0.017692,0.001348,0.021931,...,0.00656,0.004174,-0.018387,-0.018672,-0.011373,-0.00162,0.006303,0.015053,-0.013348,monetary20161214a1
20170201,0.023765,0.015234,0.019469,-0.001382,-0.019129,-0.00998,0.010161,0.013869,-0.001293,0.011482,...,0.023762,-0.021263,0.013186,-0.005077,-0.016906,-0.014289,0.003382,0.007474,-0.000253,monetary20170201a1
20170315,-0.015675,0.007259,0.02265,0.009874,-0.017463,0.014224,-0.022397,0.014078,-0.022637,-0.004554,...,0.011484,-0.00608,0.006902,0.020921,0.002148,0.02183,-0.018999,-0.008588,0.015147,monetary20170315a1


In [38]:
# obtain doc vectors for minutes
for i in range(len(txt_min_list)):
    file = txt_min_list[i]
    if re.findall(r'\d{8}', file) is not None:
        contents = txt_to_str(txt_min_path + file)
        test_content = str_to_sentences(contents, tag=False)
        vec = model.infer_vector(test_content)
        vec_min.loc[i, 'file'] = file.split('.', 1)[0]
        vec_min.iloc[i, : model.vector_size] = vec
        vec_min.loc[i, 'date'] = re.findall(r'\d{8}', file)
    else:
        pass
vec_min.set_index('date', inplace=True)
vec_min.to_excel('vec_min_pdf.xlsx')
vec_min

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,file
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20160127,-0.017573,-0.004033,0.017484,1.8e-05,-0.006594,-0.022385,-0.005872,0.007935,-0.00862,-0.014268,...,0.007604,0.024807,0.008354,-0.02078,0.020542,-0.002247,0.003618,-0.023899,0.011992,fomcminutes20160127
20160316,-0.024023,-0.014612,-0.000676,-0.021333,-0.012202,0.023846,0.022783,0.009473,-0.018503,-0.00154,...,-0.004019,0.003594,-0.01448,-0.006666,-0.009904,0.011303,0.015638,-0.010046,0.000278,fomcminutes20160316
20160427,-0.009483,-0.0214,-0.020217,-0.020622,-0.017765,-0.013586,-0.010085,0.024625,0.004016,0.015415,...,-0.005485,-0.012985,-0.002154,0.007562,-0.017655,-0.003884,-0.014993,0.009109,0.013939,fomcminutes20160427
20160615,0.005206,-0.024851,-0.021739,0.023006,0.021305,0.015038,0.011444,-0.001719,-0.024474,0.024685,...,-0.004528,-0.007042,0.014038,0.00372,0.007634,-0.015104,0.023519,-0.01633,0.008421,fomcminutes20160615
20160727,-0.354552,-0.138171,-0.187312,-0.215533,0.074018,-0.538037,0.302605,-0.280927,-0.343291,0.20292,...,-0.299644,-0.25462,-0.045079,0.235232,0.383197,0.335081,-0.105356,0.138859,0.213075,fomcminutes20160727
20160921,0.010702,0.015152,-0.024089,-0.006218,0.017446,-0.021784,0.023812,-0.002408,-0.012812,0.022661,...,-0.000954,-4.9e-05,0.00884,-0.021297,-0.020505,-0.019806,0.011161,0.012225,0.016888,fomcminutes20160921
20161102,-0.007434,-0.015474,0.017831,-0.007604,-0.021326,0.018456,-0.02433,0.006877,-0.008206,0.012887,...,0.021085,0.00934,0.022447,0.001487,0.00863,0.000432,-0.012226,-0.014912,-0.00698,fomcminutes20161102
20161214,0.003255,0.022834,-0.001323,-0.003383,0.024815,-0.012722,-0.00811,0.018393,-0.018145,-0.020264,...,-0.024765,0.021857,-0.00572,0.009144,-0.024638,0.005047,-0.02306,0.021777,0.014249,fomcminutes20161214
20170201,0.010044,-0.00991,0.014483,0.00106,0.013946,-0.00439,-0.008672,0.022617,-0.015514,-0.002352,...,0.006407,0.018754,-0.010121,0.013846,0.024681,-0.014422,0.00602,0.005697,0.004282,fomcminutes20170201
20170315,-0.023711,0.023908,0.010165,0.019468,0.012558,-0.022145,0.006918,0.015425,0.014323,-0.015609,...,0.002677,-0.014447,0.019494,0.000715,0.001242,-0.019882,-0.014327,0.023108,-0.017109,fomcminutes20170315
