In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
class TextPreprocessingPipeline:
    """Ordered list of preprocessing stages that are applied one after another.

    A stage is any callable that accepts the current data as its single
    argument. It may either return the transformed data, or mutate its
    argument in place and return ``None``.
    """

    def __init__(self):
        # Stages in registration order; execute() runs them first to last.
        self.pipeline_functions = []

    def register(self, func):
        """Append ``func`` as the last stage of the pipeline."""
        self.pipeline_functions.append(func)

    def execute(self, data):
        """Run every registered stage on ``data`` and return the final result.

        The value returned by one stage is fed to the next one, so stages
        that build a new object (instead of mutating their input) take
        effect. With no stages registered, ``data`` is returned unchanged.
        """
        for func in self.pipeline_functions:
            result = func(data)
            # A stage returning None is treated as an in-place stage:
            # keep passing the same object along.
            if result is not None:
                data = result
        return data

    def reset_pipeline(self):
        """Discard all registered stages."""
        self.pipeline_functions = []

In [4]:
# Single pipeline instance for the notebook; stages are added to it later via register().
pipeline = TextPreprocessingPipeline()

In [10]:
# Load the scraped dataset; the path is relative, so the CSV must be in the working directory.
# NOTE(review): the preview output shows "Unnamed: 0.1" and "Unnamed: 0" columns, which look
# like index columns written out by earlier saves — confirm, then drop them or use index_col.
df = pd.read_csv("Fed_Scrape-2015-2023.csv")

In [12]:
# Statements working frame: a full copy of df, so later changes to df_statements leave df untouched.
# NOTE(review): no filter (e.g. on the Type column) is applied here — confirm that is intended.
df_statements = df.copy()

# Preview the first rows.
df_statements.head()

Unnamed: 0.1,Unnamed: 0,Date,Type,Text
0,0,20230412,0,The Federal Reserve on Wednesday released the ...
1,1,20230412,0,The minutes for each regularly scheduled meeti...
2,2,20230412,0,The minutes can be viewed on the Board's website.
3,3,20230412,0,"For media inquiries, e-mail [email protected] ..."
4,4,20230412,0,Minutes of the Federal Open Market Committee\r...


In [22]:
# Collapse to one row per Date: all Text values of a date are joined with single spaces,
# then Date is turned back into a regular column.
df_statements_group = (
    df_statements
    .groupby('Date')['Text']
    .apply(lambda fragments: ' '.join(fragments))
    .reset_index()
)

In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Make sure to download the necessary resources
# (presumably 'punkt' backs word_tokenize and 'stopwords' backs stopwords.words — confirm).
nltk.download('punkt')
nltk.download('stopwords')

# English stop words as a set.
# NOTE(review): stop_words is not referenced by any code visible in this section.
stop_words = set(stopwords.words('english'))

def clean_text_lambda(text):
    """Lower-case ``text`` and keep only its purely alphanumeric tokens.

    The text is lower-cased, split with ``word_tokenize``, and every token
    for which ``str.isalnum()`` is false is discarded. The surviving tokens
    are joined with single spaces.

    Note that a token containing any punctuation character is dropped as a
    whole, not stripped of its punctuation; for example the "'m" part of
    "I'm" disappears from the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, space-separated string (empty if no token survives).
    """
    tokens = word_tokenize(text.lower())
    # isalnum() already rejects every token that contains a punctuation
    # character, so a separate punctuation-stripping pass would be a no-op.
    return ' '.join(token for token in tokens if token.isalnum())

def clean_text(frame):
    """Pipeline stage that cleans the ``Text`` column of ``frame``.

    Prints a progress message, replaces every value of ``frame['Text']``
    with the result of ``clean_text_lambda``, and returns the same frame
    object (the column is overwritten in place).
    """
    print("Executing Clean_Text")
    cleaned_column = frame['Text'].apply(clean_text_lambda)
    frame['Text'] = cleaned_column
    return frame
    
# Add clean_text as a pipeline stage. register() appends unconditionally, so re-running
# this cell adds the stage again; call pipeline.reset_pipeline() first when re-running.
pipeline.register(clean_text)

[nltk_data] Downloading package punkt to /Users/trungle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/trungle/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [25]:
# Quick sanity check of clean_text_lambda on a sentence with punctuation and a contraction.
# In the printed result the punctuation is gone and "I'm" is reduced to "i".
input_text = "Hello, I'm an AI language model! How can I help you today?"
cleaned_text = clean_text_lambda(input_text)
print(cleaned_text)

hello i an ai language model how can i help you today


In [26]:
# Inspect the translation table for string.punctuation: each punctuation code point maps
# to None, which str.translate treats as "delete this character".
str.maketrans('', '', string.punctuation)

{33: None,
 34: None,
 35: None,
 36: None,
 37: None,
 38: None,
 39: None,
 40: None,
 41: None,
 42: None,
 43: None,
 44: None,
 45: None,
 46: None,
 47: None,
 58: None,
 59: None,
 60: None,
 61: None,
 62: None,
 63: None,
 64: None,
 91: None,
 92: None,
 93: None,
 94: None,
 95: None,
 96: None,
 123: None,
 124: None,
 125: None,
 126: None}