In [3]:
#!pip install selenium

In [13]:
# Import libraries
import numpy as np
import os
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
import time

In [5]:
def retrieve_speeches(startDate, endDate,
                      driver_path = "C:\\Users\\Taavi\\Desktop\\Learning NLP\\chromedriver-win64\\chromedriver.exe",
                      page_delay = 1.0):
    """Scrape Federal Reserve speeches whose listing date lies in [startDate, endDate].

    Opens the speeches index page in Chrome via Selenium, walks the paginated
    listing, and for every row dated inside the window downloads the linked
    page and keeps the longest blank-line-separated chunk of its main column
    (taken to be the speech body).

    Parameters
    ----------
    startDate, endDate : str
        Inclusive window bounds formatted as ``MM/DD/YYYY``.
    driver_path : str, optional
        Location of the chromedriver executable. Defaults to the path that was
        previously hard-coded in this function.
    page_delay : float, optional
        Seconds to pause after clicking "Next" before re-reading the page.

    Returns
    -------
    list[str]
        One stripped text block per speech found, in listing order.

    Raises
    ------
    RuntimeError
        If the listing container is missing from the rendered page.
    ValueError
        If a window bound or a listing date is not in ``MM/DD/YYYY`` format.
    """
    # Function-scope imports keep this cell runnable without touching the import cell.
    from urllib.parse import urljoin
    from selenium.common.exceptions import NoSuchElementException

    date_format = '%m/%d/%Y'
    site_root = 'https://www.federalreserve.gov/'

    # The window bounds never change, so parse them once instead of once per page.
    start = datetime.strptime(startDate, date_format)
    end = datetime.strptime(endDate, date_format)

    service = Service(executable_path = driver_path)
    driver = webdriver.Chrome(service = service)

    docs = []
    found_in_window = False

    try:
        driver.get('https://www.federalreserve.gov/newsevents/speeches.htm')

        # Fixed pause so the script-rendered listing is present in page_source.
        time.sleep(5)

        while True:
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            body_div = soup.find('div', class_ = 'angularEvents items ng-scope')
            if body_div is None:
                raise RuntimeError('Speech listing container not found on the page')

            # (date, href) for every listing row that carries both pieces.
            entries = []
            for speech_div in body_div.find_all('div', class_ = 'row ng-scope'):
                date_tag = speech_div.find('time')
                link_tag = speech_div.find('a')
                if date_tag is None or link_tag is None:
                    continue
                date_text = date_tag.get('datetime')
                href = link_tag.get('href')
                if not date_text or not href:
                    continue
                entries.append((datetime.strptime(date_text, date_format), href))

            links_in_window = [href for date, href in entries if start <= date <= end]

            for link in links_in_window:
                # urljoin avoids the double slash produced by plain concatenation
                # when the href is root-relative.
                doc_url = urljoin(site_root, link)
                doc_response = requests.get(doc_url, timeout = 30)
                doc_soup = BeautifulSoup(doc_response.text, 'html.parser')
                speech = doc_soup.find('div', class_ = 'col-xs-12 col-sm-8 col-md-8')
                if speech is None:
                    print(f'Skipping {doc_url}: speech body not found')
                    continue

                # Longest chunk between blank lines; max() returns the first
                # maximal element, matching a strict ">" scan.
                docs.append(max(speech.get_text().split('\n\n'), key = len).strip())

            # Stop once we have been inside the window and the current page has left it.
            if links_in_window:
                found_in_window = True
            elif found_in_window:
                break

            # NOTE(review): assumes the listing is newest-first (the stop rule
            # above relies on the same ordering) -- confirm. Under that
            # assumption a page entirely older than the window start means no
            # later page can match, so an empty window ends here.
            if entries and all(date < start for date, _ in entries):
                break

            try:
                driver.find_element(By.LINK_TEXT, 'Next').click()
            except NoSuchElementException:
                # No further page to visit.
                break

            # Give the listing a moment to re-render before reading it again.
            time.sleep(page_delay)
    finally:
        # Always release the browser, including when scraping raises.
        driver.quit()

    return docs

In [6]:
def retrieve_rates(endDate, afterDate):
    """Fetch the first and last FRED ``DFF`` observations between two dates.

    ``DFF`` is presumably the daily effective federal funds rate -- verify
    against the FRED series page.

    Parameters
    ----------
    endDate : str
        Start of the observation range, formatted ``MM/DD/YYYY``; it is
        converted to ``YYYY-MM-DD`` for the request.
    afterDate : str
        End of the observation range, passed through unchanged (the caller is
        expected to supply ``YYYY-MM-DD``).

    Returns
    -------
    tuple
        ``(rate1, rate2)`` as floats: the first and last observation values in
        the range. ``(None, None)`` on every failure path: missing API key,
        non-200 response, undecodable body, or no usable observations.
    """
    api_key = os.getenv('FRED_API_KEY')
    if not api_key:
        # Without this guard a missing key would surface as a TypeError.
        print('Failed to retrieve data: FRED_API_KEY environment variable is not set')
        return None, None

    observation_start = datetime.strptime(endDate, '%m/%d/%Y').strftime('%Y-%m-%d')

    # Let requests build and escape the query string.
    params = {
        'series_id': 'DFF',
        'api_key': api_key,
        'file_type': 'json',
        'observation_start': observation_start,
        'observation_end': afterDate,
    }
    response = requests.get('https://api.stlouisfed.org/fred/series/observations',
                            params = params, timeout = 30)

    if response.status_code != 200:
        print(f'Failed to retrieve data: {response.status_code}')
        return None, None

    try:
        payload = response.json()
    except ValueError:
        # JSON decode errors raised by response.json() derive from ValueError.
        print('Failed to decode JSON. Raw response:')
        print(response.text)
        return None, None

    try:
        observations = payload['observations']
        rate1 = float(observations[0]['value'])
        rate2 = float(observations[-1]['value'])
    except (KeyError, IndexError, TypeError, ValueError):
        # Covers an empty list, a missing key, or a non-numeric value.
        print(f'No usable observations between {observation_start} and {afterDate}')
        return None, None

    return rate1, rate2

In [7]:
# Speech-collection windows as [start, end] pairs in MM/DD/YYYY, in
# chronological order and non-overlapping.
# NOTE(review): the windows look like the periods between FOMC meetings -- confirm.
windows = [
    # 2019
    ['01/31/2019', '03/18/2019'],
    ['03/21/2019', '04/29/2019'],
    ['05/02/2019', '06/17/2019'],
    ['06/20/2019', '07/29/2019'],
    ['08/01/2019', '09/16/2019'],
    ['09/19/2019', '10/03/2019'],
    ['10/05/2019', '10/28/2019'],
    ['10/31/2019', '12/09/2019'],

    # 2020
    ['12/12/2019', '01/27/2020'],
    ['04/01/2020', '04/27/2020'],
    ['04/30/2020', '06/08/2020'],
    ['06/11/2020', '07/27/2020'],
    ['07/30/2020', '08/26/2020'],
    ['08/27/2020', '09/14/2020'],
    ['09/17/2020', '11/03/2020'],
    ['11/06/2020', '12/14/2020'],

    # 2021
    ['12/17/2020', '01/25/2021'],
    ['01/28/2021', '03/15/2021'],
    ['03/18/2021', '04/26/2021'],
    ['04/29/2021', '06/14/2021'],
    ['06/17/2021', '07/26/2021'],
    ['07/29/2021', '09/20/2021'],
    ['09/23/2021', '11/01/2021'],
    # Start was '01/04/2021', which overlapped every earlier 2021 window;
    # the sequence calls for the window that follows 11/01/2021.
    ['11/04/2021', '12/13/2021'],

    # 2022
    ['12/16/2021', '01/24/2022'],
    ['01/27/2022', '03/14/2022'],
    ['03/17/2022', '05/02/2022'],
    ['05/05/2022', '06/13/2022'],
    ['06/16/2022', '07/25/2022'],
    ['07/28/2022', '09/19/2022'],
    ['09/22/2022', '10/31/2022'],
    ['11/03/2022', '12/12/2022'],

    # 2023
    ['12/15/2022', '01/30/2023'],
    ['02/02/2023', '03/20/2023'],
    ['03/23/2023', '05/01/2023'],
    ['05/04/2023', '06/12/2023'],
    ['06/15/2023', '07/24/2023'],
    ['07/27/2023', '09/18/2023'],
    ['09/21/2023', '10/30/2023'],
    ['11/02/2023', '12/11/2023'],

    # 2024
    ['12/14/2023', '01/29/2024'],
    ['02/01/2024', '03/18/2024'],
    ['03/21/2024', '04/29/2024'],
    ['05/02/2024', '06/10/2024'],
    ['06/13/2024', '07/29/2024'],
]

In [8]:
# Build one 1x4 row per window: two placeholder feature columns (zeros until
# the sentiment step is implemented), then the rate at the window end and the
# change in the rate over the following three days.
# Rows are collected in a list and stacked once at the end, rather than growing
# an array with a sentinel first row.
rows = []
for window in tqdm(windows):
    startDate, endDate = window
    docs = retrieve_speeches(startDate, endDate)

    afterDate = (datetime.strptime(endDate, '%m/%d/%Y') + timedelta(days = 3)).strftime('%Y-%m-%d')
    rates = retrieve_rates(endDate, afterDate)
    # Tolerate a bare None as well as a (None, None) pair.
    rate1, rate2 = rates if rates is not None else (None, None)

    if rate1 is None or rate2 is None:
        # Report the window that could not be priced and move on.
        print(window)
        print(endDate, afterDate)
        continue

    rateChange = rate2 - rate1
    ratesArray = np.array([[rate1, rateChange]])

    windowArray = np.zeros((1, 2))
    for doc in docs:
        # TODO: sentiment analysis here
        # TODO: output to array
        # docArray = ...
        # windowArray = np.concatenate((windowArray, docArray), axis = 0)
        pass

    # TODO: once per-document rows exist, drop the placeholder row and average:
    #windowArray = windowArray[1:]
    #windowArray = np.mean(windowArray, axis = 0)
    windowArray = np.concatenate((windowArray, ratesArray), axis = 1)

    rows.append(windowArray)

# NOTE: despite its name, `df` is a NumPy array of shape (n_windows_kept, 4).
df = np.concatenate(rows, axis = 0) if rows else np.zeros((0, 4))

In [21]:
# Example of a full speech
# `docs` holds the speeches of the last window processed by the loop above, so
# guard the index: a window with fewer than three speeches would otherwise raise.
if len(docs) > 2:
    print(docs[2])
else:
    print(f'Only {len(docs)} speech(es) available in the last window')

Thank you, Jeff, and thank you to the Federal Reserve Bank of Kansas City for the opportunity to speak to you today.1 So far, 2024 has been a challenging year for economic forecasters, and for monetary policymakers. After significant progress in 2023 toward the Federal Open Market Committee's (FOMC) price-stability goal, inflation jumped in the first quarter. At the same time, both the labor market and economic growth ran strong enough that some commentators wondered whether monetary policy was restrictive enough and whether rate hikes should be back on the table. These twists and turns in the economic data shifted everyone's expectations back and forth as to when the FOMC might begin lowering its policy interest rate and how many cuts there would be this year. During this time, my consistent view was that there was no urgency to cut rates until the Committee is confident that inflation is returning sustainably to 2 percent.
Then, in the second quarter, data on inflation and the labor 

# List of Features to Engineer (not comprehensive):
- infl_numPos
- infl_numNeg
- infl_whole

- empl_numPos
- empl_numNeg
- empl_whole

- grth_numPos
- grth_numNeg
- grth_whole

- cnsm_numPos
- cnsm_numNeg
- cnsm_whole

- mrkt_numPos
- mrkt_numNeg
- mrkt_whole