## Scrape text from FOMC statements and Calculate Yield Changes

### 1. Import and mess with statements

In [3]:
import PyPDF2
import os
import pandas as pd

#https://fraser.stlouisfed.org/title/677#576435

In [5]:
pdfFileObj = open('FOMC_statements/19940204statement.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfReader.numPages

FileNotFoundError: [Errno 2] No such file or directory: 'FOMC_statements/19940204statement.pdf'

In [4]:
pageObj = pdfReader.getPage(0)
pageObj.extractText()

"Release Date: February 4, 1994\n For immediate releaseChairman Alan Greenspan announced today that the Federal Open Market Committee decided toincrease slightly the degree of pressure on reserve positions. The action is expected to be associatedwith a small increase in short-term money market interest rates.\nThe decision was taken to move toward a less accommodative stance in monetary policy in order tosustain and enhance the economic expansion.Chairman Greenspan decided to announce this action immediately so as to avoid anymisunderstanding of the Committee's purposes, given the fact that this is the first firming of reservemarket conditions by the Committee since early 1989.Home | Press releases\nAccessibility\n | Contact UsLast update: April 20, 2007\n"

In [5]:
statements = []
dates = []

# loop through files
for filename in os.listdir('FOMC_statements'):
    if filename.endswith(".pdf"):
        
        # add date to list
        dt = filename.replace('statement.pdf', '')
        dates.append(dt)
        
        # read specific file
        pdfFileObj = open('FOMC_statements/' + filename, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        
        p = ''
        # loop through pages
        for i in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(i)
            p += pageObj.extractText()
            
        statements.append(p)
        

In [6]:
data = pd.DataFrame({'date': dates, 'text': statements})

In [9]:
data.head()
data.to_csv("fomc_statements.csv")

#### Note

The code above was used to scrape the pdfs, but the text was still very messy, so I manually cleaned a bunch of them.

In [4]:
# Reload cleaned statements
data = pd.read_csv("fomc_statements_clean.csv")

In [5]:
# Clean dates
data.date = pd.to_datetime(data.date, format='%Y%m%d')
data = data.sort_values('date')
data.head()

Unnamed: 0,date,text
0,1994-02-04,Chairman Alan Greenspan announced today that t...
1,1994-03-22,Chairman Alan Greenspan announced today that t...
2,1994-04-18,Chairman Alan Greenspan announced today that t...
3,1994-05-17,The Federal Reserve today announced two action...
4,1994-08-16,The Federal Reserve announced today the follow...


### 2. Import and mess with treasury data

In [6]:
treas = pd.read_csv('USTREASURY-YIELD.csv')
treas.Date = pd.to_datetime(treas.Date, format='%Y-%m-%d')
treas = treas.sort_values('Date')
treas.head()

Unnamed: 0,Date,1 MO,3 MO,6 MO,1 YR,2 YR,3 YR,5 YR,7 YR,10 YR,20 YR,30 YR
7121,1990-01-02,,7.83,7.89,7.81,7.87,7.9,7.87,7.98,7.94,,8.0
7120,1990-01-03,,7.89,7.94,7.85,7.94,7.96,7.92,8.04,7.99,,8.04
7119,1990-01-04,,7.84,7.9,7.82,7.92,7.93,7.91,8.02,7.98,,8.04
7118,1990-01-05,,7.79,7.85,7.79,7.9,7.94,7.92,8.03,7.99,,8.06
7117,1990-01-08,,7.79,7.88,7.81,7.9,7.95,7.92,8.05,8.02,,8.09


In [7]:
h = treas.loc[:, treas.columns != 'Date'].diff()
h.insert(loc=0, column='date', value=treas['Date'])
h = h.set_index('date')
h.head()

Unnamed: 0_level_0,1 MO,3 MO,6 MO,1 YR,2 YR,3 YR,5 YR,7 YR,10 YR,20 YR,30 YR
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1990-01-02,,,,,,,,,,,
1990-01-03,,0.06,0.05,0.04,0.07,0.06,0.05,0.06,0.05,,0.04
1990-01-04,,-0.05,-0.04,-0.03,-0.02,-0.03,-0.01,-0.02,-0.01,,0.0
1990-01-05,,-0.05,-0.05,-0.03,-0.02,0.01,0.01,0.01,0.01,,0.02
1990-01-08,,0.0,0.03,0.02,0.0,0.01,0.0,0.02,0.03,,0.03


In [8]:
# Subset only FOMC statement dates
dts = list(data['date'])
g = h.loc[dts]
g.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,1 MO,3 MO,6 MO,1 YR,2 YR,3 YR,5 YR,7 YR,10 YR,20 YR,30 YR
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1994-02-04,,0.1,0.12,0.11,0.14,0.15,0.15,0.14,0.13,0.11,0.06
1994-03-22,,-0.1,-0.08,-0.08,-0.06,-0.11,-0.11,-0.12,-0.11,-0.09,-0.09
1994-04-18,,0.16,0.19,0.18,0.19,0.2,0.2,0.2,0.17,0.13,0.12
1994-05-17,,-0.01,-0.1,-0.12,-0.15,-0.17,-0.19,-0.23,-0.21,-0.19,-0.19
1994-08-16,,0.02,-0.02,-0.02,-0.05,-0.06,-0.1,-0.12,-0.11,-0.12,-0.12


In [9]:
# Write data to csv
g.to_csv('treas_changes.csv')

In [14]:
labels = g['10 YR']
labels = [1 if i >= 0 else 0 for i in labels]

In [15]:
data['labels']= labels
data.head()

Unnamed: 0,date,text,labels
0,1994-02-04,Chairman Alan Greenspan announced today that t...,1
1,1994-03-22,Chairman Alan Greenspan announced today that t...,0
2,1994-04-18,Chairman Alan Greenspan announced today that t...,1
3,1994-05-17,The Federal Reserve today announced two action...,0
4,1994-08-16,The Federal Reserve announced today the follow...,0


In [16]:
data.to_csv('statements_with_labels.csv')