## Modules

In [86]:
## Sleeping
import time as tm
import random as rd

## Tidying
import numpy as np
import pandas as pd
import re
import datetime

## Scraping
from bs4 import BeautifulSoup # https://www.crummy.com/software/BeautifulSoup/bs4/doc/
from selenium import webdriver # https://selenium-python.readthedocs.io/locating-elements.html
from selenium.webdriver.chrome.options import Options

## NLP
import nltk
## Might need to download these
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
##
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/Cookie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Cookie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Cookie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Setup

In [87]:
# Beige Book landing pages on federalreserve.gov.
# Neither constant is referenced again in the visible cells; the yearly index
# pages are built by data_engineering_1.year_link_create instead.
BB_2020_url = 'https://www.federalreserve.gov/monetarypolicy/beige-book-default.htm'
BB_2019_1996_urls = 'https://www.federalreserve.gov/monetarypolicy/beige-book-archive.htm' # 2017-2020 has the same format

# Headless Chrome configuration for the Selenium webdriver.
chrome_options = Options()
for _chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(_chrome_flag)

# NOTE(review): the chromedriver path is an absolute, machine-specific location;
# other users must point it at their own chromedriver binary.
# NOTE(review): newer Selenium releases prefer `options=` over `chrome_options=`
# -- confirm against the installed Selenium version before changing.
driver = webdriver.Chrome(
    '/Users/Cookie/node_modules/chromedriver/lib/chromedriver/chromedriver',
    chrome_options = chrome_options,
)

## Class Creation

In [88]:
class data_engineering_1:
    """Helpers for scraping Federal Reserve Beige Book reports and preparing the text.

    The scraping methods (``find_links``, ``pull_corpora_17_20``) use the
    module-level Selenium ``driver``; the text methods use NLTK names imported
    in the imports cell. The class holds no instance state.
    """

    # Substrings identifying report links whose page layout pull_corpora_17_20 can parse.
    _SUPPORTED_YEARS = ('2017', '2018', '2019', '2020')

    def __init__(self):
        """Nothing to initialise; the class only groups related helpers."""

    def year_link_create(self, yr: int):
        """Return the URL of the Beige Book index page for the year ``yr``."""
        return 'https://www.federalreserve.gov/monetarypolicy/beigebook' + str(yr) + '.htm'

    def find_links(self, BB_url: str):
        """Load ``BB_url`` in the shared driver and return every Beige Book report href on it.

        Returns the raw ``href`` strings (possibly site-relative) of all anchors
        whose target matches ``/monetarypolicy/beigebook*.htm``.
        """
        driver.get(BB_url)

        # Random pause so the page can finish loading and requests are spaced out.
        tm.sleep(rd.randint(2, 4))

        soup = BeautifulSoup(driver.page_source, 'lxml')

        # The dot before "htm" is escaped so it only matches a literal ".".
        report_href = re.compile(r"/monetarypolicy/beigebook.*\.htm")

        return [str(anchor['href']) for anchor in soup.find_all("a", href=report_href)]

    def format_links(self, link: str):
        """Return ``link`` as an absolute federalreserve.gov URL, prefixing the host if absent."""
        if 'https://www.federalreserve.gov' not in link:
            return 'https://www.federalreserve.gov' + link

        return link

    def simple_clean_corpus(self, corpus: str):
        """Strip newlines, ``<p>``/``<br/>`` markup and ``<strong>`` headings from ``corpus``.

        The ``<strong>`` pattern is non-greedy so that, when several headings sit
        on one line, only each heading is removed and the text between them is kept.
        """
        return re.sub('\n|<p>|</p>|<br/>|<strong>.*?</strong>', '', corpus)

    @staticmethod
    def _first_paragraph_index(paragraphs: list, marker: str):
        """Return the index of the first paragraph containing ``marker``; IndexError if none."""
        for position, paragraph in enumerate(paragraphs):
            if marker in paragraph:
                return position

        raise IndexError('No paragraph contains ' + repr(marker))

    def pull_corpora_17_20(self, links: list, date = None, overallEconomicActivity = None, employmentPrices = None):
        """Scrape the national-summary sections from each 2017-2020 report in ``links``.

        Parameters
        ----------
        links : list
            Absolute report URLs. Links that contain none of the supported year
            substrings are skipped without being loaded.
        date, overallEconomicActivity, employmentPrices : list, optional
            Existing lists to append to. Each defaults to a fresh empty list per
            call, so repeated calls (or re-running the cell) do not accumulate
            earlier results.

        Returns
        -------
        tuple
            ``(date, overallEconomicActivity, employmentPrices)``: parallel lists
            with one entry per scraped report.

        Raises
        ------
        IndexError
            If a report page lacks one of the expected section headings.
        """
        date = [] if date is None else date
        overallEconomicActivity = [] if overallEconomicActivity is None else overallEconomicActivity
        employmentPrices = [] if employmentPrices is None else employmentPrices

        for n, link in enumerate(links):

            # Decide before touching the network: unsupported pages would be discarded anyway.
            if not any(year in link for year in self._SUPPORTED_YEARS):
                continue

            driver.get(link)

            tm.sleep(rd.randint(1, 3))

            reportSoup = BeautifulSoup(driver.page_source, 'lxml')

            # The "Last Update" banner supplies the report date for the dataframe.
            date.append(re.sub('Last Update:|\n|\\s{2,}',
                               '',
                               reportSoup.find('div', {'class':'lastUpdate'}).text))

            textSoup = [str(t) for t in reportSoup.find_all('p')]

            positionOEA = self._first_paragraph_index(textSoup, 'Overall Economic Activity')
            positionEW = self._first_paragraph_index(textSoup, 'Employment and Wages')
            positionP = self._first_paragraph_index(textSoup, 'Prices')

            # Overall activity runs up to the employment heading; the second corpus
            # spans employment through the paragraph containing "Prices" (inclusive).
            corpusOverallEconomicActivity = ''.join(textSoup[positionOEA:positionEW])
            corpusEmploymentPrices = ''.join(textSoup[positionEW:(positionP+1)])

            overallEconomicActivity.append(self.simple_clean_corpus(corpusOverallEconomicActivity))

            employmentPrices.append(self.simple_clean_corpus(corpusEmploymentPrices))

            print('Corpora from link ' + str(n) + ' cleaned and collected')

        return date, overallEconomicActivity, employmentPrices

    def str_to_datetime(self, date: str):
        """Parse a date such as ``'January 18, 2017'`` into a ``datetime.datetime``.

        All whitespace is removed first, so the text is parsed as ``%B%d,%Y``.
        """
        date = re.sub('\\s', '', date)

        return datetime.datetime.strptime(date, '%B%d,%Y')

    def get_wordnet_pos(self, word: str):
        """
        Map POS tag to first character lemmatize() accepts, from:
        https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#wordnetlemmatizerwithappropriatepostag

        Unrecognised tags fall back to noun.
        NOTE(review): nltk.pos_tag needs an NLTK tagger resource that the imports
        cell does not download -- confirm it is installed.
        """
        # Imported here because the imports cell never brings `wordnet` into scope.
        from nltk.corpus import wordnet

        tag = nltk.pos_tag([word])[0][1][0].upper()

        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)

    def remove_stop_lemma_words(self, col: str):
        '''
        Removes stop words and retrieves lemmas of the words in the statements
        I've chosen lemmas over stems because it is more nuanced and sophisticated. There's a good explanation here:
        https://stackoverflow.com/questions/1787110/what-is-the-difference-between-lemmatization-vs-stemming

        Returns the list of lemmatized, lower-cased tokens that are not English stop words.
        '''
        stopWords = set(stopwords.words("english"))

        stopWords.add(' ')

        col = col.lower() # all strings to lower

        col = re.sub(r'\s{2,}', ' ', col) # turn two or more whitespace characters into 1 space

        # Built locally so the method does not depend on a global defined in a later cell.
        lemmatizer = WordNetLemmatizer()

        return [lemmatizer.lemmatize(w, self.get_wordnet_pos(w))
                for w in word_tokenize(col)
                if w not in stopWords] # Only keep if word isn't a stop word

dataEngineering1 = data_engineering_1()

## Run Class

In [89]:
# Yearly index pages to scan for report links.
years = range(1996,2020)

BB_urls = [dataEngineering1.year_link_create(yr) for yr in years]

# Visit each index page in order and flatten the per-page link lists into one list.
links = [href for index_url in BB_urls for href in dataEngineering1.find_links(index_url)]

# Make every link absolute.
updated_lists = [dataEngineering1.format_links(href) for href in links]

print('Collected Links')

date, overallEconomicActivity, employmentPrices = dataEngineering1.pull_corpora_17_20(updated_lists)

beigeBookExtracts = pd.DataFrame(
    {
        "Date": date,
        "OverallEconomicActivity": overallEconomicActivity,
        "EmploymentPrices": employmentPrices,
    }
)

# Date strings -> datetime objects.
beigeBookExtracts['Date'] = beigeBookExtracts['Date'].apply(dataEngineering1.str_to_datetime)

## Feature Creation ##

## Sentence-level columns
beigeBookExtracts['OverallEconomicActivity_sentences'] = beigeBookExtracts['OverallEconomicActivity'].apply(sent_tokenize)

beigeBookExtracts['EmploymentPrices_sentences'] = beigeBookExtracts['EmploymentPrices'].apply(sent_tokenize)

## Word Net Lemmatizer read as a global by remove_stop_lemma_words
wnl = WordNetLemmatizer()

## Lemmatized, stop-word-free word columns
beigeBookExtracts['OverallEconomicActivity_words'] = beigeBookExtracts['OverallEconomicActivity'].apply(dataEngineering1.remove_stop_lemma_words)

beigeBookExtracts['EmploymentPrices_words'] = beigeBookExtracts['EmploymentPrices'].apply(dataEngineering1.remove_stop_lemma_words)

beigeBookExtracts.head(2)

Collected Links
Corpora from link 48 cleaned and collected
Corpora from link 49 cleaned and collected
Corpora from link 50 cleaned and collected
Corpora from link 51 cleaned and collected
Corpora from link 52 cleaned and collected
Corpora from link 53 cleaned and collected
Corpora from link 54 cleaned and collected
Corpora from link 55 cleaned and collected
Corpora from link 56 cleaned and collected
Corpora from link 57 cleaned and collected
Corpora from link 58 cleaned and collected
Corpora from link 59 cleaned and collected
Corpora from link 60 cleaned and collected
Corpora from link 61 cleaned and collected
Corpora from link 62 cleaned and collected
Corpora from link 63 cleaned and collected
Corpora from link 64 cleaned and collected
Corpora from link 65 cleaned and collected
Corpora from link 66 cleaned and collected
Corpora from link 67 cleaned and collected
Corpora from link 68 cleaned and collected
Corpora from link 69 cleaned and collected
Corpora from link 70 cleaned and colle

Unnamed: 0,Date,OverallEconomicActivity,EmploymentPrices,OverallEconomicActivity_sentences,EmploymentPrices_sentences,OverallEconomicActivity_words,EmploymentPrices_words
0,2017-01-18,Reports from the twelve Federal Reserve Distri...,Labor markets were reported to be tight or tig...,[Reports from the twelve Federal Reserve Distr...,[Labor markets were reported to be tight or ti...,"[report, twelve, federal, reserve, district, i...","[labor, market, report, tight, tighten, period..."
1,2017-03-01,Reports from all twelve Federal Reserve Distri...,"Labor markets remained tight in early 2017, wi...",[Reports from all twelve Federal Reserve Distr...,"[Labor markets remained tight in early 2017, w...","[report, twelve, federal, reserve, district, i...","[labor, market, remain, tight, early, 2017, ,,..."


## Write CSV

In [90]:
# beigeBookExtracts.to_csv('beigeBookExtracts.csv')