In [1]:
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/grivoire/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/grivoire/nltk_data...
[nltk_data] Downloading package punkt to /Users/grivoire/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
def lemmatize_words(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='n') for word in words]

    return ' '.join(words)

In [3]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r"[\[]]", "", text)
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[%s]' % re.escape('!"#$%&\'()*+,-./:;<=>?@[]^_`{|}~'), ' ', text)
    text = re.sub('(\\\\n)', "", text)
    text = re.sub('(\\\\r)', "", text)
    text = re.sub('(â\\\\)', "", text)
    text = re.sub('(â\\\\\\\\)', "", text)
    text = re.sub('(ä\\\\)', "", text)
    text = re.sub('(\\\\)', "", text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('(â\x80\x94)', '', text)
    text = re.sub('(\x80\x93)', '', text)
    text = re.sub('(ï»¿)', '', text)
    text = re.sub('(return to text)','',text)
    text = re.sub('(references).*$','',text)
    
    #need modification above to further improve text cleaning    
    return text

round1 = lambda x: clean_text_round1(x)

In [4]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text=  re.sub("'\\n\'", '', text)
#     text = re.sub("(, \)",'',text)
    text = re.sub("(\\r\\n\\n)",'',text)
    text = re.sub('(")','',text)
    text = re.sub("('t\\t)",'',text)
    text = re.sub("(n\\n)",'',text)
    text = re.sub("(\n)",'',text)
    text = re.sub("(\r\n\r\n)",'',text)
    text = re.sub("(click here for a highresolution version of this photo)",'',text)
    text = re.sub("(the federal reserve  the central bank of the united states  provides          the nation with a safe  flexible  and stable monetary and financial          system     federal open market committee    monetary policy principles and practice    policy implementation    reports    review of monetary policy strategy  tools  and                    communications    institution supervision    reports    reporting forms    supervision   regulation letters    banking applications   legal developments    regulatory resources    banking   data structure    financial stability assessments     financial stability coordination   actions    reports    regulations   statutes    payment policies    reserve bank payment services   data    financial market utilities   infrastructures    research  committees  and forums    working papers and notes    data  models and tools    bank assets and liabilities    bank structure data    business finance    dealer financing terms    exchange rates and international data    financial accounts    household finance    industrial activity    interest rates    micro data reference manual  mdrm     money stock and reserve balances    other    regulations      enforcement    community development      analysis    consumer resources)",'',text)
    text = re.sub("(the federal reserve  the central bank of the united states  provides          the nation with a safe  flexible  and stable monetary and financial          system     federal open market committee    monetary policy principles and practice    policy implementation    reports    review of monetary policy strategy  tools  and                    communications    institution supervision    reports    reporting forms    supervision   regulation letters    banking applications   legal developments    regulatory resources    banking   data structure     financial stability assessments     financial stability coordination   actions    reports    regulations   statutes    payment policies    reserve bank payment services   data    financial market utilities   infrastructures    research  committees  and forums    working papers and notes    data  models and tools    bank assets and liabilities    bank structure data    business finance    dealer financing terms    exchange rates and international data    financial accounts    household finance    industrial activity    interest rates    micro data reference manual  mdrm     money stock and reserve balances    other    regulations      enforcement    community development      analysis    consumer resources)",'',text)
    text = re.sub("(the federal reserve  the central bank of the united states  provides            the nation with a safe  flexible  and stable monetary and financial            system   federal open market committee  monetary policy principles and practice  policy implementation  reports  review of monetary policy strategy  tools  and                      communications  institution supervision  reports  reporting forms  supervision   regulation letters  banking applications   legal developments  regulatory resources  banking   data structure  financial stability assessments  financial stability coordination   actions  reports  regulations   statutes  payment policies  reserve bank payment services   data  financial market utilities   infrastructures  research  committees  and forums  working papers and notes  data  models and tools  bank assets and liabilities  bank structure data  business finance  dealer financing terms  exchange rates and international data  financial accounts  household finance  industrial activity  interest rates  micro data reference manual  mdrm   money stock and reserve balances  other  regulations  supervision  enforcement  community development  research  analysis  consumer resources)",'',text)
    text = re.sub("(prepared at the federal reserve bank of)",'',text)
    text = re.sub("(based on information collected on or before)",'',text)
    text = re.sub("(a meeting of the federal open market committee was held in       the offices of the board of governors of the federal reserve system in       washington  d c)",'',text)
    text = re.sub("(a joint meeting of the federal open market committee and the board of governors)",'',text)
    text = re.sub("(of the federal reserve system was held in the offices of the board of governors)",'',text)
    
    return text

round2 = lambda x: clean_text_round2(x)

# Cleanup for FOMC Statements

In [5]:
statement = pd.read_pickle('MonetaryPolicyCommittee.pkl')

In [6]:
statement

Unnamed: 0,FOMC_Statements
1994-02-04,"The Federal Reserve, the central bank of the U..."
1994-03-22,Chairman Alan Greenspan announced today that t...
1994-04-18,Chairman Alan Greenspan announced today that t...
1994-05-17,The Federal Reserve today announced two action...
1994-08-16,The Federal Reserve announced today the follow...
...,...
2022-05-04,"The Federal Reserve, the central bank of the U..."
2022-06-15,"The Federal Reserve, the central bank of the U..."
2022-07-27,"The Federal Reserve, the central bank of the U..."
2022-09-21,"The Federal Reserve, the central bank of the U..."


In [7]:
statement.iloc[12].FOMC_Statements

'For immediate release        The Federal Open Market Committee decided today to ease the stance of monetary policy slightly, expecting the federal funds rate to decline 1/4 percentage point to around 5-1/4 percent.    The action was taken to cushion the effects on prospective economic growth in the United States of increasing weakness in foreign economies and of less accommodative financial conditions domestically.  The recent changes in the global economy and adjustments in U.S. financial markets mean that a slightly lower federal funds rate should now be consistent with keeping inflation low and sustaining economic growth going forward.    The discount rate remains unchanged at 5 percent.                1998 Monetary policy   Home | News and events Accessibility  Last update: September 29, 1998, 2:15 PM  For immediate release        The Federal Open Market Committee decided today to ease the stance of monetary policy slightly, expecting the federal funds rate to decline 1/4 percenta

In [8]:
statement.rename(columns = {'FOMC_Statements':'text'},inplace =True)

In [9]:
statement['source'] = 'Statements'

In [10]:
statement['lemmatized'] = statement['text'].apply(lemmatize_words)

In [11]:
# Let's take a look at the updated text
statement_clean = pd.DataFrame(statement.text.apply(round1))
statement_clean

Unnamed: 0,text
1994-02-04,the federal reserve the central bank of the u...
1994-03-22,chairman alan greenspan announced today that t...
1994-04-18,chairman alan greenspan announced today that t...
1994-05-17,the federal reserve today announced two action...
1994-08-16,the federal reserve announced today the follow...
...,...
2022-05-04,the federal reserve the central bank of the u...
2022-06-15,the federal reserve the central bank of the u...
2022-07-27,the federal reserve the central bank of the u...
2022-09-21,the federal reserve the central bank of the u...


In [12]:
# Let's take a look at the updated text
statement_clean = pd.DataFrame(statement_clean.text.apply(round2))
statement_clean

Unnamed: 0,text
1994-02-04,december for release at p m est ...
1994-03-22,chairman alan greenspan announced today that t...
1994-04-18,chairman alan greenspan announced today that t...
1994-05-17,the federal reserve today announced two action...
1994-08-16,the federal reserve announced today the follow...
...,...
2022-05-04,march for release at p m edt ...
2022-06-15,june for release at p m edt ...
2022-07-27,july for release at p m edt ...
2022-09-21,january for release at p m est ...


In [13]:
statement_clean['source'] = 'Statements'

In [14]:
statement_clean['lemmatized'] = statement_clean['text'].apply(lemmatize_words)

In [15]:
statement.to_pickle('statement.pkl')
statement_clean.to_pickle('statement_clean.pkl')

# Cleanup for FOMC Minutes

In [16]:
minutes = pd.read_pickle('FederalReserveMins.pkl')

In [17]:
minutes

Unnamed: 0,Federal_Reserve_Mins
1995-02-01,A meeting of the Federal Open Market Committee...
1995-03-28,A meeting of the Federal Open Market Committee...
1995-05-23,A meeting of the Federal Open Market Committee...
1995-07-06,A meeting of the Federal Open Market Committee...
1995-08-22,A meeting of the Federal Open Market Committee...
...,...
2022-05-04,"The Federal Reserve, the central bank of the U..."
2022-06-15,"The Federal Reserve, the central bank of the U..."
2022-07-27,"The Federal Reserve, the central bank of the U..."
2022-09-21,"The Federal Reserve, the central bank of the U..."


In [18]:
minutes.iloc[-1].Federal_Reserve_Mins

'The Federal Reserve, the central bank of the United States, provides            the nation with a safe, flexible, and stable monetary and financial            system.  Federal Open Market Committee  Monetary Policy Principles and Practice  Policy Implementation  Reports  Review of Monetary Policy Strategy, Tools, and                      Communications  Institution Supervision  Reports  Reporting Forms  Supervision & Regulation Letters  Banking Applications & Legal Developments  Regulatory Resources  Banking & Data Structure  Financial Stability Assessments  Financial Stability Coordination & Actions  Reports  Regulations & Statutes  Payment Policies  Reserve Bank Payment Services & Data  Financial Market Utilities & Infrastructures  Research, Committees, and Forums  Working Papers and Notes  Data, Models and Tools  Bank Assets and Liabilities  Bank Structure Data  Business Finance  Dealer Financing Terms  Exchange Rates and International Data  Financial Accounts  Household Finance  I

In [19]:
minutes.rename(columns = {'Federal_Reserve_Mins':'text'},inplace =True)

In [20]:
minutes['source'] = 'Minutes'

In [21]:
minutes['lemmatized'] = minutes['text'].apply(lemmatize_words)

In [22]:
# Let's take a look at the updated text
minutes_clean = pd.DataFrame(minutes.text.apply(round1))
minutes_clean

Unnamed: 0,text
1995-02-01,a meeting of the federal open market committee...
1995-03-28,a meeting of the federal open market committee...
1995-05-23,a meeting of the federal open market committee...
1995-07-06,a meeting of the federal open market committee...
1995-08-22,a meeting of the federal open market committee...
...,...
2022-05-04,the federal reserve the central bank of the u...
2022-06-15,the federal reserve the central bank of the u...
2022-07-27,the federal reserve the central bank of the u...
2022-09-21,the federal reserve the central bank of the u...


In [90]:
minutes_clean['text'][0]

'   starting on tuesday  january    at   p m        and continuing on wednesday  february    at   a m   present   mr  greenspan  chairman        mr  mcdonough  vice chairman        mr  blinder        mr  hoenig        mr  kelley        mr  laware        mr  lindsey        mr  melzer        ms  minehan        mr  moskow        ms  phillips        ms  yellen  messrs  boehne  jordan  mcteer  and stern         alternate members of the federal open market        committee  messrs  broaddus  forrestal  and parry  presidents        of the federal reserve banks of richmond         atlanta  and san francisco  respectively  mr  kohn  secretary and economist        mr  bernard  deputy secretary        mr  coyne  assistant secretary        mr  gillum  assistant secretary        mr  mattingly  general counsel        mr  patrikis  deputy general counsel        mr  prell  economist        mr  truman  economist  messrs  davis  dewald  lindsey  mishkin  promisel         siegman  slifman  and stockton  

In [89]:
# Let's take a look at the updated text
minutes_clean = pd.DataFrame(minutes_clean.text.apply(round2))
minutes_clean

Unnamed: 0,text
1995-02-01,starting on tuesday january at p m ...
1995-03-28,on tuesday march at a m present...
1995-05-23,on tuesday may at a m present mr...
1995-07-06,on wednesday july at p m and ...
1995-08-22,on tuesday august at a m present ...
...,...
2022-05-04,may – on tuesday may at a m and...
2022-06-15,june – on tuesday june at a m a...
2022-07-27,july on tuesday july at a m a...
2022-09-21,september on tuesday september a...


In [25]:
minutes_clean.iloc[-2]['text']

'  september        on tuesday  september     at   p m  and continued on wednesday  september     at   a m   attendance   jerome h  powell  chair  john c  williams  vice chair  michael s  barr  michelle w  bowman  lael brainard  james bullard  susan m  collins  lisa d  cook  esther l  george  philip n  jefferson  loretta j  mester  christopher j  waller  charles l  evans  patrick harker  neel kashkari  lorie k  logan  and helen e  mucciolo  alternate members of the committee  thomas i  barkin  raphael w  bostic  and mary c  daly  presidents of the federal reserve banks of richmond  atlanta  and san francisco  respectively  james a  clouse  secretary  matthew m  luecke  deputy secretary  brian j  bonis  assistant secretary  michelle a  smith  assistant secretary  mark e  van der weide  general counsel  trevor a  reeve  economist  stacey tevlin  economist  beth anne wilson  economist  shaghil ahmed  joseph w  gruber  carlos garriga  and william wascher  associate economists  patricia zob

In [26]:
minutes_clean['source'] = 'Minutes'

In [27]:
minutes_clean['lemmatized'] = minutes_clean['text'].apply(lemmatize_words)

In [28]:
minutes.to_pickle('minutes.pkl')
minutes_clean.to_pickle('minutes_clean.pkl')

# Cleanup for Beige Books

In [30]:
beige = pd.read_csv('Beige_data.csv')
beige['Date'] = beige['date'].apply(lambda d: pd.Timestamp(d))


In [31]:
beige = beige.sort_values('Date').set_index('Date')

In [32]:
beige

Unnamed: 0_level_0,date,Beige_Book
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1996-10-30,1996-10-30,Prepared at the Federal Reserve Bank of Minnea...
1996-12-04,1996-12-04,Prepared at the Federal Reserve Bank of Philad...
1997-01-22,1997-01-22,Prepared at the Federal Reserve Bank of Atlant...
1997-03-12,1997-03-12,Prepared at the Federal Reserve Bank of St. Lo...
1997-05-07,1997-05-07,Prepared at the Federal Reserve Bank of Chicag...
...,...,...
2021-06-02,2021-06-02,Full Report\nNational Summary\nThis report was...
2021-07-14,2021-07-14,Full Report\nNational Summary\nThis report was...
2021-09-08,2021-09-08,Full Report\nNational Summary\nThis report was...
2021-10-20,2021-10-20,Full Report\nNational Summary\nThis report was...


In [33]:
beige.drop(columns = 'date',inplace=True)

In [34]:
beige.rename(columns = {'Beige_Book':'text'},inplace =True)

In [35]:
beige['source'] = 'Beige Book'

In [36]:
beige_2022 = pd.read_pickle('BeigeBook.pkl')
beige_2022['source'] = 'Beige Book'

In [37]:
beige_2022.rename(columns = {'Beige_Book':'text'},inplace =True)

In [38]:
beige_final = pd.concat([beige,beige_2022])
beige_final

Unnamed: 0,text,source
1996-10-30,Prepared at the Federal Reserve Bank of Minnea...,Beige Book
1996-12-04,Prepared at the Federal Reserve Bank of Philad...,Beige Book
1997-01-22,Prepared at the Federal Reserve Bank of Atlant...,Beige Book
1997-03-12,Prepared at the Federal Reserve Bank of St. Lo...,Beige Book
1997-05-07,Prepared at the Federal Reserve Bank of Chicag...,Beige Book
...,...,...
2022-06-01,"The Federal Reserve, the central bank of the U...",Beige Book
2022-07-01,"The Federal Reserve, the central bank of the U...",Beige Book
2022-09-01,"The Federal Reserve, the central bank of the U...",Beige Book
2022-10-01,"The Federal Reserve, the central bank of the U...",Beige Book


In [39]:
beige_final['lemmatized'] = beige_final['text'].apply(lemmatize_words)

In [40]:
# Let's take a look at the updated text
beige_clean = pd.DataFrame(beige_final.text.apply(round1))
beige_clean

Unnamed: 0,text
1996-10-30,prepared at the federal reserve bank of minnea...
1996-12-04,prepared at the federal reserve bank of philad...
1997-01-22,prepared at the federal reserve bank of atlant...
1997-03-12,prepared at the federal reserve bank of st lo...
1997-05-07,prepared at the federal reserve bank of chicag...
...,...
2022-06-01,the federal reserve the central bank of the u...
2022-07-01,the federal reserve the central bank of the u...
2022-09-01,the federal reserve the central bank of the u...
2022-10-01,the federal reserve the central bank of the u...


In [41]:
# Let's take a look at the updated text
beige_clean = pd.DataFrame(beige_clean.text.apply(round2))
beige_clean

Unnamed: 0,text
1996-10-30,minneapolis based on information collected be...
1996-12-04,philadelphia based on information collected b...
1997-01-22,atlanta and based on information collected be...
1997-03-12,st louis and based on information collected ...
1997-05-07,chicago and based on information collected be...
...,...
2022-06-01,this report was minneapolis april this...
2022-07-01,overall economic activity economic activity...
2022-09-01,overall economic activity national economic...
2022-10-01,this report was san francisco august t...


In [42]:
beige_clean['source'] = 'Beige Book'

In [43]:
beige_clean['lemmatized'] = beige_clean['text'].apply(lemmatize_words)

In [44]:
#sample doc with lots of growth keywords
beige_clean.iloc[-3]['text']



In [45]:
beige_clean

Unnamed: 0,text,source,lemmatized
1996-10-30,minneapolis based on information collected be...,Beige Book,minneapolis based on information collected bef...
1996-12-04,philadelphia based on information collected b...,Beige Book,philadelphia based on information collected be...
1997-01-22,atlanta and based on information collected be...,Beige Book,atlanta and based on information collected bef...
1997-03-12,st louis and based on information collected ...,Beige Book,st louis and based on information collected be...
1997-05-07,chicago and based on information collected be...,Beige Book,chicago and based on information collected bef...
...,...,...,...
2022-06-01,this report was minneapolis april this...,Beige Book,this report wa minneapolis april this document...
2022-07-01,overall economic activity economic activity...,Beige Book,overall economic activity economic activity ex...
2022-09-01,overall economic activity national economic...,Beige Book,overall economic activity national economic ac...
2022-10-01,this report was san francisco august t...,Beige Book,this report wa san francisco august this docum...


In [46]:
beige_final.to_pickle('beige_final.pkl')
beige_clean.to_pickle('beige_clean.pkl')

# Cleanup for FED Speeches 

In [None]:
# Load historical fed speeches from 1996-2020
fed_speech = pd.read_csv("fed_speeches_1996_2020.csv")
fed_speech["link"]

In [None]:
fed_speech.loc[fed_speech.date.isna() == True]

In [None]:
fed_speech.drop(720, axis = 0,inplace = True)

In [None]:
fed_speech = fed_speech.reset_index(drop = True)

In [None]:
fed_speech['timestamp'] = 0
for i in range(len(fed_speech.date)):
    fed_speech['timestamp'][i] = pd.to_datetime(str(int(fed_speech.date[i])),format = '%Y%m%d').date()

In [None]:
fed_speech = fed_speech.sort_values(['timestamp','speaker']).reset_index(drop=True)

In [None]:
fed_speech['Date'] = fed_speech['timestamp'].apply(lambda d: pd.Timestamp(d))

In [None]:
fed_speech

In [None]:
# Load Scraped Fed Speech From 2006 - 2022
fed_speech_new = pd.read_csv('fed_speech(2006-2022).csv')
fed_speech_new

In [None]:
fed_speech_new['timestamp'] = 0
for i in range(len(fed_speech_new.date)):
    fed_speech_new['timestamp'][i] = pd.to_datetime(fed_speech_new.date[i],format = '%m/%d/%Y').date()

In [None]:
fed_speech_new = fed_speech_new.sort_values(['timestamp','speaker']).reset_index(drop=True)

In [None]:
# Convert datetime to pandas timestamp
fed_speech_new['Date'] = fed_speech_new['timestamp'].apply(lambda d: pd.Timestamp(d))

In [None]:
fed_speech_new[fed_speech_new['Date'] > '2020-06']

In [None]:
# Combine the Fed speeches from Historical File + Scraped Text into one DataFrame
fed_combined = fed_speech[['Date','speaker','text','title','event','link']]
fed_combined

In [None]:
# Append new FED speeches from where the historical file left off (19 Jun 2020 Onwards) 
fed_append = fed_speech_new.iloc[811:][['Date','speaker','text','title','event','links_for_speeches']]
fed_append

In [None]:
fed_append.rename(columns={'links_for_speeches':'link'},inplace = True)

In [None]:
fed_final = pd.concat([fed_combined,fed_append]).reset_index(drop=True)
fed_final

In [None]:
# Review the merged section (from row 1454 onwards)
fed_final.iloc[1454:]

In [None]:
# Pickle the final, unprocessed text file
fed_final.to_pickle("fed_final.pkl")

In [48]:
# Load up pickle file
fed_final = pd.read_pickle("fed_final.pkl")

In [51]:
fed_final

Unnamed: 0,Date,speaker,text,title,event,link
0,1996-06-13,Chairman Alan Greenspan,Remarks by Chairman Alan Greenspan Bank superv...,Bank supervision in a world economy,At the International Conference of Banking Sup...,https://www.federalreserve.gov/boarddocs/speec...
1,1996-06-18,"Governor Edward W. Kelley, Jr.","Remarks by Governor Edward W. Kelley, Jr. Deve...",Developments in electronic money and banking,"At the CyberPayments '96 Conference, Dallas, T...",https://www.federalreserve.gov/boarddocs/speec...
2,1996-09-08,Governor Laurence H. Meyer,Monetary Policy Objectives and Strategy I wan...,Monetary policy objectives and strategy,At the National Association of Business Econom...,https://www.federalreserve.gov/boarddocs/speec...
3,1996-09-19,Chairman Alan Greenspan,Remarks by Chairman Alan Greenspan Regulation ...,Regulation and electronic payment systems,At the U.S. Treasury Conference on Electronic ...,https://www.federalreserve.gov/boarddocs/speec...
4,1996-10-02,Governor Lawrence B. Lindsey,Remarks by Governor Lawrence B. Lindsey At the...,Small business is big business,"At the Small Business Development Conference, ...",https://www.federalreserve.gov/boarddocs/speec...
...,...,...,...,...,...,...
1596,2022-11-17,Governor Philip N. Jefferson,"['ï»¿\n', '\nThe Federal Reserve, the central ...",Opportunity and Inclusive Economic Growth,"At the 2022 Institute Research Conference, hos...",https://www.federalreserve.gov/newsevents/spee...
1597,2022-11-28,Vice Chair Lael Brainard,"['ï»¿\n', '\nThe Federal Reserve, the central ...",What Can We Learn from the Pandemic and the Wa...,At the 21st BIS Annual Conference Central Bank...,https://www.federalreserve.gov/newsevents/spee...
1598,2022-11-30,Chair Jerome H. Powell,"['ï»¿\n', '\nThe Federal Reserve, the central ...",Inflation and the Labor Market,At the Hutchins Center on Fiscal and Monetary ...,https://www.federalreserve.gov/newsevents/spee...
1599,2022-11-30,Governor Lisa D. Cook,"['ï»¿\n', '\nThe Federal Reserve, the central ...",The Economic Outlook and U.S. Productivity,"At the Detroit Economic Club, Detroit, Michigan",https://www.federalreserve.gov/newsevents/spee...


In [52]:
speech = fed_final[['Date','text']]

In [53]:
speech['source'] = 'Speeches'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  speech['source'] = 'Speeches'


In [56]:
# Lemmatize uncleaned Fed speech
speech['lemmatized'] = speech['text'].apply(lemmatize_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  speech['lemmatized'] = speech['text'].apply(lemmatize_words)


In [59]:
speech.set_index('Date',inplace = True)

In [60]:
speech

Unnamed: 0_level_0,text,source,lemmatized
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1996-06-13,Remarks by Chairman Alan Greenspan Bank superv...,Speeches,Remarks by Chairman Alan Greenspan Bank superv...
1996-06-18,"Remarks by Governor Edward W. Kelley, Jr. Deve...",Speeches,"Remarks by Governor Edward W. Kelley, Jr. Deve..."
1996-09-08,Monetary Policy Objectives and Strategy I wan...,Speeches,Monetary Policy Objectives and Strategy I want...
1996-09-19,Remarks by Chairman Alan Greenspan Regulation ...,Speeches,Remarks by Chairman Alan Greenspan Regulation ...
1996-10-02,Remarks by Governor Lawrence B. Lindsey At the...,Speeches,Remarks by Governor Lawrence B. Lindsey At the...
...,...,...,...
2022-11-17,"['ï»¿\n', '\nThe Federal Reserve, the central ...",Speeches,"['ï»¿\n', '\nThe Federal Reserve, the central ..."
2022-11-28,"['ï»¿\n', '\nThe Federal Reserve, the central ...",Speeches,"['ï»¿\n', '\nThe Federal Reserve, the central ..."
2022-11-30,"['ï»¿\n', '\nThe Federal Reserve, the central ...",Speeches,"['ï»¿\n', '\nThe Federal Reserve, the central ..."
2022-11-30,"['ï»¿\n', '\nThe Federal Reserve, the central ...",Speeches,"['ï»¿\n', '\nThe Federal Reserve, the central ..."


In [62]:
# Let's take a look at the updated text
speech_clean = pd.DataFrame(speech.text.apply(round1))
speech_clean

Unnamed: 0_level_0,text
Date,Unnamed: 1_level_1
1996-06-13,remarks by chairman alan greenspan bank superv...
1996-06-18,remarks by governor edward w kelley jr deve...
1996-09-08,monetary policy objectives and strategy i wan...
1996-09-19,remarks by chairman alan greenspan regulation ...
1996-10-02,remarks by governor lawrence b lindsey at the...
...,...
2022-11-17,the federal reserve the central bank of...
2022-11-28,the federal reserve the central bank of...
2022-11-30,the federal reserve the central bank of...
2022-11-30,the federal reserve the central bank of...


In [63]:
# Let's take a look at the updated text
speech_clean = pd.DataFrame(speech_clean.text.apply(round2))
speech_clean

Unnamed: 0_level_0,text
Date,Unnamed: 1_level_1
1996-06-13,remarks by chairman alan greenspan bank superv...
1996-06-18,remarks by governor edward w kelley jr deve...
1996-09-08,monetary policy objectives and strategy i wan...
1996-09-19,remarks by chairman alan greenspan regulation ...
1996-10-02,remarks by governor lawrence b lindsey at the...
...,...
2022-11-17,november governor philip n je...
2022-11-28,november vice chair lael brain...
2022-11-30,november chair jerome h powel...
2022-11-30,november governor lisa d cook...


In [64]:
speech_clean['source'] = 'Speeches'

In [65]:
# Lemmatize Fed Speech
speech_clean['lemmatized'] = speech_clean['text'].apply(lemmatize_words)

In [66]:
speech_clean

Unnamed: 0_level_0,text,source,lemmatized
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1996-06-13,remarks by chairman alan greenspan bank superv...,Speeches,remark by chairman alan greenspan bank supervi...
1996-06-18,remarks by governor edward w kelley jr deve...,Speeches,remark by governor edward w kelley jr developm...
1996-09-08,monetary policy objectives and strategy i wan...,Speeches,monetary policy objective and strategy i want ...
1996-09-19,remarks by chairman alan greenspan regulation ...,Speeches,remark by chairman alan greenspan regulation o...
1996-10-02,remarks by governor lawrence b lindsey at the...,Speeches,remark by governor lawrence b lindsey at the s...
...,...,...,...
2022-11-17,november governor philip n je...,Speeches,november governor philip n jefferson at the in...
2022-11-28,november vice chair lael brain...,Speeches,november vice chair lael brainard at the bi an...
2022-11-30,november chair jerome h powel...,Speeches,november chair jerome h powell at the hutchins...
2022-11-30,november governor lisa d cook...,Speeches,november governor lisa d cook at the detroit e...


In [67]:
speech.to_pickle('speech_lemmatized.pkl')


In [68]:
speech_clean.to_pickle('speech_clean_lemmatized.pkl')

In [69]:
speech

Unnamed: 0_level_0,text,source,lemmatized
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1996-06-13,Remarks by Chairman Alan Greenspan Bank superv...,Speeches,Remarks by Chairman Alan Greenspan Bank superv...
1996-06-18,"Remarks by Governor Edward W. Kelley, Jr. Deve...",Speeches,"Remarks by Governor Edward W. Kelley, Jr. Deve..."
1996-09-08,Monetary Policy Objectives and Strategy I wan...,Speeches,Monetary Policy Objectives and Strategy I want...
1996-09-19,Remarks by Chairman Alan Greenspan Regulation ...,Speeches,Remarks by Chairman Alan Greenspan Regulation ...
1996-10-02,Remarks by Governor Lawrence B. Lindsey At the...,Speeches,Remarks by Governor Lawrence B. Lindsey At the...
...,...,...,...
2022-11-17,"['ï»¿\n', '\nThe Federal Reserve, the central ...",Speeches,"['ï»¿\n', '\nThe Federal Reserve, the central ..."
2022-11-28,"['ï»¿\n', '\nThe Federal Reserve, the central ...",Speeches,"['ï»¿\n', '\nThe Federal Reserve, the central ..."
2022-11-30,"['ï»¿\n', '\nThe Federal Reserve, the central ...",Speeches,"['ï»¿\n', '\nThe Federal Reserve, the central ..."
2022-11-30,"['ï»¿\n', '\nThe Federal Reserve, the central ...",Speeches,"['ï»¿\n', '\nThe Federal Reserve, the central ..."


## Combine Beige Book, FED Statements, FED Speeches & FED Minutes into one dataframe

In [70]:
FED = pd.concat([beige_final,statement,speech,minutes])
FED

Unnamed: 0,text,source,lemmatized
1996-10-30,Prepared at the Federal Reserve Bank of Minnea...,Beige Book,Prepared at the Federal Reserve Bank of Minnea...
1996-12-04,Prepared at the Federal Reserve Bank of Philad...,Beige Book,Prepared at the Federal Reserve Bank of Philad...
1997-01-22,Prepared at the Federal Reserve Bank of Atlant...,Beige Book,Prepared at the Federal Reserve Bank of Atlant...
1997-03-12,Prepared at the Federal Reserve Bank of St. Lo...,Beige Book,Prepared at the Federal Reserve Bank of St. Lo...
1997-05-07,Prepared at the Federal Reserve Bank of Chicag...,Beige Book,Prepared at the Federal Reserve Bank of Chicag...
...,...,...,...
2022-05-04,"The Federal Reserve, the central bank of the U...",Minutes,"The Federal Reserve, the central bank of the U..."
2022-06-15,"The Federal Reserve, the central bank of the U...",Minutes,"The Federal Reserve, the central bank of the U..."
2022-07-27,"The Federal Reserve, the central bank of the U...",Minutes,"The Federal Reserve, the central bank of the U..."
2022-09-21,"The Federal Reserve, the central bank of the U...",Minutes,"The Federal Reserve, the central bank of the U..."


In [72]:
FED.sort_index(inplace=True)

In [73]:
FED.to_pickle('FED.pkl')

In [74]:
FED_clean = pd.concat([beige_clean,statement_clean,speech_clean,minutes_clean])
FED_clean.sort_index(inplace=True)
FED_clean

Unnamed: 0,text,source,lemmatized
1994-02-04,december for release at p m est ...,Statements,december for release at p m est share informat...
1994-03-22,chairman alan greenspan announced today that t...,Statements,chairman alan greenspan announced today that t...
1994-04-18,chairman alan greenspan announced today that t...,Statements,chairman alan greenspan announced today that t...
1994-05-17,the federal reserve today announced two action...,Statements,the federal reserve today announced two action...
1994-08-16,the federal reserve announced today the follow...,Statements,the federal reserve announced today the follow...
...,...,...,...
2022-11-17,november governor philip n je...,Speeches,november governor philip n jefferson at the in...
2022-11-28,november vice chair lael brain...,Speeches,november vice chair lael brainard at the bi an...
2022-11-30,november chair jerome h powel...,Speeches,november chair jerome h powell at the hutchins...
2022-11-30,november governor lisa d cook...,Speeches,november governor lisa d cook at the detroit e...


In [75]:
FED_clean.to_pickle('FED_clean.pkl')

In [77]:
# Test if we could filter by data sources
FED_clean[FED_clean['source'] == 'Speeches']

Unnamed: 0,text,source,lemmatized
1996-06-13,remarks by chairman alan greenspan bank superv...,Speeches,remark by chairman alan greenspan bank supervi...
1996-06-18,remarks by governor edward w kelley jr deve...,Speeches,remark by governor edward w kelley jr developm...
1996-09-08,monetary policy objectives and strategy i wan...,Speeches,monetary policy objective and strategy i want ...
1996-09-19,remarks by chairman alan greenspan regulation ...,Speeches,remark by chairman alan greenspan regulation o...
1996-10-02,remarks by governor lawrence b lindsey at the...,Speeches,remark by governor lawrence b lindsey at the s...
...,...,...,...
2022-11-17,november governor philip n je...,Speeches,november governor philip n jefferson at the in...
2022-11-28,november vice chair lael brain...,Speeches,november vice chair lael brainard at the bi an...
2022-11-30,november chair jerome h powel...,Speeches,november chair jerome h powell at the hutchins...
2022-11-30,november governor lisa d cook...,Speeches,november governor lisa d cook at the detroit e...
