In [22]:
import PyPDF2
import pandas as pd
import re  # For regular expressions
import string  # For string operations and punctuation removal
from nltk.tokenize import word_tokenize  # For word tokenization
from nltk.corpus import stopwords  # For stop words
from nltk.stem import WordNetLemmatizer  # For word lemmatization
# Module-level WordNet lemmatizer instance; text_cleaner calls
# lemmatizer.lemmatize() on each token it keeps.
lemmatizer = WordNetLemmatizer()


In [23]:
# PDFs to load. Forward slashes are accepted on Windows, macOS and Linux, and
# they avoid the invalid escape sequences ("\s", "\T", "\O", "\B") that the
# backslash-separated literals contained.
pdf_files = ["data/speeches/Trump.pdf",
             "data/speeches/Obama.pdf",
             "data/speeches/Bush.pdf",
             "data/speeches/Biden.pdf"]  # Replace with your list of PDF files

pdf_texts = []  # Extracted text of each PDF, in the same order as pdf_files

for pdf_file_path in pdf_files:
    # The context manager closes the file even if parsing or extraction raises.
    with open(pdf_file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Extract every page while the file is still open; `or ""` keeps the
        # join below safe should a page yield no text at all.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    pdf_text = "".join(page_texts)  # Whole document as a single string
    pdf_texts.append(pdf_text)

In [24]:
# Print the list of extracted texts (one string per PDF) for inspection.
print(pdf_texts)

["Trump  \n01/20/2017  \nInaugural Address  \nChief Justice Roberts, President Carter, President Clinton, President Bush, President Obama, \nfellow Americans, and people of the world: thank you.  \nWe, the citizens of America, are now joined in a great national effort to rebuild our country and \nto restore its promise for all of our people.  \nTogether, we will determine the course of America and the world for years to come.  \nWe will face challenges. We will confront hardships. But we will get the job done.  \nEvery four years, we gather on these steps to carry out the orderly and peaceful transfer of \npower, and we are grateful to President Obama and First Lady Michelle Obama for their gracious \naid throughout this transition. They have been magnificent.  \nToday’s ceremony, however, has very special meaning. Because today we are not merely \ntransferring power from one Administration to another, or from one party to another – but we are \ntransferring power from Washington, D.C.

In [25]:
# One row per extracted PDF text, stored in a single 'text' column.
df = pd.DataFrame(
    data=pdf_texts,
    columns=['text'],
)

In [17]:
# Export as CSV. Forward slashes keep the path portable across operating
# systems and avoid the invalid "\s" / "\p" escape sequences that a
# backslash-separated, non-raw string literal contains.
df.to_csv('data/speeches/presidents_speeches.csv', index=False)

In [18]:
# (rows, columns) of the speeches frame.
df.shape

(4, 1)

In [19]:
# Reload the speeches CSV from disk. Forward slashes keep the path portable
# and avoid the invalid "\s" / "\p" escape sequences of a backslash literal.
df = pd.read_csv('data/speeches/presidents_speeches.csv')
df.head()

Unnamed: 0,text
0,Trump \n01/20/2017 \nInaugural Address \nCh...
1,Obam a \n01/20/2009 \nInaugural Address \n \...
2,Bush \n01/20/2001 \nInaugural Address \n \n...
3,Biden \n01/20/2021 \nInaugural Address \nCh...


In [26]:
def text_cleaner(text):
    """Normalize raw speech text into a space-separated string of lemmas.

    Steps, in order: lowercase, collapse runs of line breaks, drop digits,
    strip punctuation (ASCII and Unicode), tokenize, remove English stop
    words, and lemmatize each remaining token with the module-level
    ``lemmatizer``.

    Parameters
    ----------
    text : str or None
        Raw text. ``None``, NaN (a missing cell after ``pd.read_csv``) and the
        empty string are accepted.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` for empty or
        missing input.
    """
    import unicodedata  # stdlib; imported locally so the cell stands alone

    # Missing values would otherwise fail on `.lower()`; NaN is the only float
    # that is not equal to itself.
    is_missing = text is None or (isinstance(text, float) and text != text)
    if is_missing or text == '':
        return ''

    # Convert text to lowercase
    text = text.lower()
    # Collapse any run of CR/LF characters into a single newline. The previous
    # class '[\r\n|\r\n]' also matched a literal '|', because alternation has
    # no meaning inside a character class.
    text = re.sub(r'[\r\n]+', '\n', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove ASCII punctuation and symbols
    text = text.translate(str.maketrans('', '', string.punctuation))
    # string.punctuation is ASCII-only, so also drop every Unicode punctuation
    # character (categories P*): curly quotes such as ’ “ ”, dashes – —, etc.
    text = ''.join(
        ch for ch in text if not unicodedata.category(ch).startswith('P')
    )

    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    # Drop stop words first, then lemmatize what is left.
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)

In [27]:
# Run the cleaner over every raw speech and keep the result alongside the original text.
df['clean_speech'] = df['text'].map(text_cleaner)

In [28]:
# Display the frame: raw 'text' next to its 'clean_speech' counterpart.
df

Unnamed: 0,text,clean_speech
0,Trump \n01/20/2017 \nInaugural Address \nCh...,trump inaugural address chief justice robert p...
1,Obam a \n01/20/2009 \nInaugural Address \n \...,obam inaugural address stand today humbled tas...
2,Bush \n01/20/2001 \nInaugural Address \n \n...,bush inaugural address president clinton disti...
3,Biden \n01/20/2021 \nInaugural Address \nCh...,biden inaugural address chief justice robert v...
